diff --git a/.github/workflows/helm.yaml b/.github/workflows/helm.yaml new file mode 100644 index 000000000..1f1a9fb74 --- /dev/null +++ b/.github/workflows/helm.yaml @@ -0,0 +1,54 @@ +--- +name: helm + +on: + pull_request: + branches: [main] + paths: + - 'deployments/gpu-operator/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +env: + CHART_NAME: gpu-operator + CHART_DIR: deployments + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Set up chart-testing + uses: helm/chart-testing-action@6ec842c01de15ebb84c8627d2744a0c2f2755c9f # v2.8.0 + + - name: Run chart-testing (lint) + run: | + ct lint \ + --target-branch ${{ github.event.repository.default_branch }} \ + --all \ + --validate-maintainers=false \ + --chart-dirs ${{ env.CHART_DIR }} + + unittest: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Set up Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5 + + - name: Install helm-unittest plugin + run: | + echo "Installing helm-unittest plugin..." 
+ helm plugin install https://github.com/helm-unittest/helm-unittest.git --verify=false --version v1.0.3 # 6f82a998e0b5461762ca959f87f5dd344af5e4eb + + - name: Run helm unittest + run: helm unittest ${{ env.CHART_DIR }}/${{ env.CHART_NAME }} diff --git a/deployments/gpu-operator/.helmignore b/deployments/gpu-operator/.helmignore index 50af03172..8f42159b6 100644 --- a/deployments/gpu-operator/.helmignore +++ b/deployments/gpu-operator/.helmignore @@ -20,3 +20,5 @@ .idea/ *.tmproj .vscode/ +# helm unittest +tests/ diff --git a/deployments/gpu-operator/tests/__snapshot__/cleanup_crd_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/cleanup_crd_test.yaml.snap new file mode 100644 index 000000000..e8fadaa7e --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/cleanup_crd_test.yaml.snap @@ -0,0 +1,48 @@ +should match snapshot: + 1: | + apiVersion: batch/v1 + kind: Job + metadata: + annotations: + helm.sh/hook: pre-delete + helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation + helm.sh/hook-weight: "1" + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: v25.0.0 + helm.sh/chart: gpu-operator-v1.0.0-devel + name: gpu-operator-cleanup-crd + namespace: gpu-operator + spec: + template: + metadata: + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: v25.0.0 + helm.sh/chart: gpu-operator-v1.0.0-devel + name: gpu-operator-cleanup-crd + spec: + containers: + - args: + - delete + - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml + - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml + - --filepath=/opt/gpu-operator/nfd-api-crds.yaml + command: + - /usr/bin/manage-crds + image: nvcr.io/nvidia/gpu-operator:v25.0.0 + imagePullPolicy: 
IfNotPresent + name: cleanup-crd + restartPolicy: OnFailure + serviceAccountName: gpu-operator + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Equal + value: "" diff --git a/deployments/gpu-operator/tests/__snapshot__/clusterpolicy_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/clusterpolicy_test.yaml.snap new file mode 100644 index 000000000..1f4db97b7 --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/clusterpolicy_test.yaml.snap @@ -0,0 +1,436 @@ +should match snapshot: + 1: | + apiVersion: nvidia.com/v1 + kind: ClusterPolicy + metadata: + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: v25.0.0 + helm.sh/chart: gpu-operator-v1.0.0-devel + name: cluster-policy + spec: + ccManager: + defaultMode: "on" + enabled: true + image: k8s-cc-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.4.0 + cdi: + enabled: true + daemonsets: + labels: + app.kubernetes.io/managed-by: gpu-operator + helm.sh/chart: gpu-operator-v1.0.0-devel + priorityClassName: system-node-critical + rollingUpdate: + maxUnavailable: "1" + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + updateStrategy: RollingUpdate + dcgm: + enabled: false + image: dcgm + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: 4.5.2-1-ubuntu22.04 + dcgmExporter: + enabled: true + image: dcgm-exporter + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/k8s + service: + internalTrafficPolicy: Cluster + serviceMonitor: + additionalLabels: {} + enabled: false + honorLabels: false + interval: 15s + relabelings: [] + scrapeTimeout: 10s + version: 4.5.1-4.8.0-distroless + devicePlugin: + enabled: true + image: k8s-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + 
version: v0.19.1 + driver: + certConfig: + name: "" + enabled: true + image: driver + imagePullPolicy: IfNotPresent + kernelModuleConfig: + name: "" + kernelModuleType: auto + licensingConfig: + nlsEnabled: true + secretName: "" + manager: + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + rdma: + enabled: false + useHostMofed: false + repoConfig: + configMapName: "" + repository: nvcr.io/nvidia + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 60 + upgradePolicy: + autoUpgrade: true + drain: + deleteEmptyDir: false + enable: false + force: false + timeoutSeconds: 300 + maxParallelUpgrades: 1 + maxUnavailable: 25% + podDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + waitForCompletion: + timeoutSeconds: 0 + useNvidiaDriverCRD: false + usePrecompiled: false + version: 595.58.03 + virtualTopology: + config: "" + gdrcopy: + enabled: false + image: gdrdrv + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v2.5.2 + gds: + enabled: false + image: nvidia-fs + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: 2.27.3 + gfd: + enabled: true + image: k8s-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + version: v0.19.1 + hostPaths: + driverInstallDir: /run/nvidia/driver + rootFS: / + kataManager: + enabled: false + imagePullPolicy: IfNotPresent + kataSandboxDevicePlugin: + enabled: true + image: nvidia-sandbox-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.0.3 + mig: + strategy: single + migManager: + config: + default: all-disabled + enabled: true + gpuClientsConfig: + name: "" + image: k8s-mig-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.14.0 + nodeStatusExporter: + enabled: false + image: gpu-operator + imagePullPolicy: 
IfNotPresent + repository: nvcr.io/nvidia + version: v25.0.0 + operator: + runtimeClass: nvidia + psa: + enabled: false + sandboxDevicePlugin: + enabled: true + image: kubevirt-gpu-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + version: v1.5.0 + sandboxWorkloads: + defaultWorkload: container + enabled: false + mode: kubevirt + toolkit: + enabled: true + image: container-toolkit + imagePullPolicy: IfNotPresent + installDir: /usr/local/nvidia + repository: nvcr.io/nvidia/k8s + version: v1.19.0 + validator: + image: gpu-operator + imagePullPolicy: IfNotPresent + plugin: + env: [] + repository: nvcr.io/nvidia + version: v25.0.0 + vfioManager: + driverManager: + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + enabled: true + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + vgpuDeviceManager: + config: + default: default + name: "" + enabled: true + image: vgpu-device-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.4.2 + vgpuManager: + driverManager: + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + enabled: false + image: vgpu-manager + imagePullPolicy: IfNotPresent + kernelModuleConfig: + name: "" +should match snapshot when cleanupCRD is enabled (resource-policy=keep): + 1: | + apiVersion: nvidia.com/v1 + kind: ClusterPolicy + metadata: + annotations: + helm.sh/resource-policy: keep + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: v25.0.0 + helm.sh/chart: gpu-operator-v1.0.0-devel + name: cluster-policy + spec: + ccManager: + defaultMode: "on" + enabled: true + image: k8s-cc-manager + imagePullPolicy: IfNotPresent + repository: 
nvcr.io/nvidia/cloud-native + version: v0.4.0 + cdi: + enabled: true + daemonsets: + labels: + app.kubernetes.io/managed-by: gpu-operator + helm.sh/chart: gpu-operator-v1.0.0-devel + priorityClassName: system-node-critical + rollingUpdate: + maxUnavailable: "1" + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + updateStrategy: RollingUpdate + dcgm: + enabled: false + image: dcgm + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: 4.5.2-1-ubuntu22.04 + dcgmExporter: + enabled: true + image: dcgm-exporter + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/k8s + service: + internalTrafficPolicy: Cluster + serviceMonitor: + additionalLabels: {} + enabled: false + honorLabels: false + interval: 15s + relabelings: [] + scrapeTimeout: 10s + version: 4.5.1-4.8.0-distroless + devicePlugin: + enabled: true + image: k8s-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + version: v0.19.1 + driver: + certConfig: + name: "" + enabled: true + image: driver + imagePullPolicy: IfNotPresent + kernelModuleConfig: + name: "" + kernelModuleType: auto + licensingConfig: + nlsEnabled: true + secretName: "" + manager: + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + rdma: + enabled: false + useHostMofed: false + repoConfig: + configMapName: "" + repository: nvcr.io/nvidia + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 60 + upgradePolicy: + autoUpgrade: true + drain: + deleteEmptyDir: false + enable: false + force: false + timeoutSeconds: 300 + maxParallelUpgrades: 1 + maxUnavailable: 25% + podDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + waitForCompletion: + timeoutSeconds: 0 + useNvidiaDriverCRD: false + usePrecompiled: false + version: 595.58.03 + virtualTopology: + config: "" + gdrcopy: + enabled: false + image: gdrdrv + 
imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v2.5.2 + gds: + enabled: false + image: nvidia-fs + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: 2.27.3 + gfd: + enabled: true + image: k8s-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + version: v0.19.1 + hostPaths: + driverInstallDir: /run/nvidia/driver + rootFS: / + kataManager: + enabled: false + imagePullPolicy: IfNotPresent + kataSandboxDevicePlugin: + enabled: true + image: nvidia-sandbox-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.0.3 + mig: + strategy: single + migManager: + config: + default: all-disabled + enabled: true + gpuClientsConfig: + name: "" + image: k8s-mig-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.14.0 + nodeStatusExporter: + enabled: false + image: gpu-operator + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + version: v25.0.0 + operator: + runtimeClass: nvidia + psa: + enabled: false + sandboxDevicePlugin: + enabled: true + image: kubevirt-gpu-device-plugin + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + version: v1.5.0 + sandboxWorkloads: + defaultWorkload: container + enabled: false + mode: kubevirt + toolkit: + enabled: true + image: container-toolkit + imagePullPolicy: IfNotPresent + installDir: /usr/local/nvidia + repository: nvcr.io/nvidia/k8s + version: v1.19.0 + validator: + image: gpu-operator + imagePullPolicy: IfNotPresent + plugin: + env: [] + repository: nvcr.io/nvidia + version: v25.0.0 + vfioManager: + driverManager: + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + enabled: true + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + vgpuDeviceManager: + config: + default: default + name: "" + enabled: 
true + image: vgpu-device-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.4.2 + vgpuManager: + driverManager: + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + enabled: false + image: vgpu-manager + imagePullPolicy: IfNotPresent + kernelModuleConfig: + name: "" diff --git a/deployments/gpu-operator/tests/__snapshot__/clusterrole_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/clusterrole_test.yaml.snap new file mode 100644 index 000000000..b1b913afa --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/clusterrole_test.yaml.snap @@ -0,0 +1,317 @@ +should match snapshot: + 1: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRole + metadata: + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: main-latest + helm.sh/chart: gpu-operator-v1.0.0-devel + name: gpu-operator + rules: + - apiGroups: + - config.openshift.io + resources: + - clusterversions + - proxies + verbs: + - get + - list + - watch + - apiGroups: + - image.openshift.io + resources: + - imagestreams + verbs: + - get + - list + - watch + - apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - use + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - watch + - update + - patch + - apiGroups: + - "" + resources: + - events + verbs: + - create + - get + - list + - watch + - delete + - apiGroups: + - "" 
+ resources: + - pods + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create + - apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch + - apiGroups: + - nvidia.com + resources: + - clusterpolicies + - clusterpolicies/finalizers + - clusterpolicies/status + - nvidiadrivers + - nvidiadrivers/finalizers + - nvidiadrivers/status + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - deletecollection + - apiGroups: + - scheduling.k8s.io + resources: + - priorityclasses + verbs: + - get + - list + - watch + - create + - apiGroups: + - node.k8s.io + resources: + - runtimeclasses + verbs: + - get + - list + - create + - update + - watch + - delete + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - update + - patch + - create +should match snapshot when cleanupCRD is enabled: + 1: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRole + metadata: + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: main-latest + helm.sh/chart: gpu-operator-v1.0.0-devel + name: gpu-operator + rules: + - apiGroups: + - config.openshift.io + resources: + - clusterversions + - proxies + verbs: + - get + - list + - watch + - apiGroups: + - image.openshift.io + resources: + - imagestreams + verbs: + - get + - list + - watch + - apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - use + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + 
- patch + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - watch + - update + - patch + - apiGroups: + - "" + resources: + - events + verbs: + - create + - get + - list + - watch + - delete + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create + - apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch + - apiGroups: + - nvidia.com + resources: + - clusterpolicies + - clusterpolicies/finalizers + - clusterpolicies/status + - nvidiadrivers + - nvidiadrivers/finalizers + - nvidiadrivers/status + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - deletecollection + - apiGroups: + - scheduling.k8s.io + resources: + - priorityclasses + verbs: + - get + - list + - watch + - create + - apiGroups: + - node.k8s.io + resources: + - runtimeclasses + verbs: + - get + - list + - create + - update + - watch + - delete + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - update + - patch + - create + - delete diff --git a/deployments/gpu-operator/tests/__snapshot__/clusterrolebinding_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/clusterrolebinding_test.yaml.snap new file mode 100644 index 000000000..68f20d880 --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/clusterrolebinding_test.yaml.snap @@ -0,0 +1,21 @@ +should match snapshot: + 1: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: main-latest + helm.sh/chart: gpu-operator-v1.0.0-devel + name: gpu-operator + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: gpu-operator + subjects: + - 
kind: ServiceAccount + name: gpu-operator + namespace: gpu-operator diff --git a/deployments/gpu-operator/tests/__snapshot__/dcgm_exporter_config_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/dcgm_exporter_config_test.yaml.snap new file mode 100644 index 000000000..f7f077996 --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/dcgm_exporter_config_test.yaml.snap @@ -0,0 +1,16 @@ +should match snapshot when create=true and data is provided: + 1: | + apiVersion: v1 + data: + dcgm-metrics.csv: | + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: main-latest + helm.sh/chart: gpu-operator-v1.0.0-devel + name: custom-metrics + namespace: gpu-operator diff --git a/deployments/gpu-operator/tests/__snapshot__/extra_objects_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/extra_objects_test.yaml.snap new file mode 100644 index 000000000..198423ff9 --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/extra_objects_test.yaml.snap @@ -0,0 +1,19 @@ +should match snapshot for an extra object provided as a structured map: + 1: | + apiVersion: v1 + data: + foo: bar + kind: ConfigMap + metadata: + name: extra-cm + namespace: gpu-operator +should match snapshot for an extra object provided as a templated string: + 1: | + apiVersion: v1 + kind: Secret + metadata: + name: extra-secret + namespace: gpu-operator + stringData: + release: gpu-operator + type: Opaque diff --git a/deployments/gpu-operator/tests/__snapshot__/mig_config_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/mig_config_test.yaml.snap new file mode 100644 index 000000000..277d95c55 --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/mig_config_test.yaml.snap @@ -0,0 +1,20 @@ +should match snapshot when create=true and data is provided: + 1: | + 
apiVersion: v1 + data: + config.yaml: |- + version: v1 + mig-configs: + all-disabled: + - devices: all + mig-enabled: false + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: main-latest + helm.sh/chart: gpu-operator-v1.0.0-devel + name: custom-mig-parted + namespace: gpu-operator diff --git a/deployments/gpu-operator/tests/__snapshot__/nodefeaturerules_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/nodefeaturerules_test.yaml.snap new file mode 100644 index 000000000..5a8da4c4a --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/nodefeaturerules_test.yaml.snap @@ -0,0 +1,32 @@ +should match snapshot when nodefeaturerules is enabled: + 1: | + apiVersion: nfd.k8s-sigs.io/v1alpha1 + kind: NodeFeatureRule + metadata: + name: nvidia-kernel-modules + spec: + rules: + - labels: + nvidia.com/gdrcopy.capable: "true" + matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + gdrdrv: + op: Exists + name: kernel-module-gdrdrv + - labels: + nvidia.com/gds.capable: "true" + matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + nvidia_fs: + op: Exists + name: kernel-module-nvidia_fs + - labels: + nvidia.com/peermem.capable: "true" + matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + nvidia_peermem: + op: Exists + name: kernel-module-nvidia_peermem diff --git a/deployments/gpu-operator/tests/__snapshot__/nvidiadriver_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/nvidiadriver_test.yaml.snap new file mode 100644 index 000000000..790e7ca1a --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/nvidiadriver_test.yaml.snap @@ -0,0 +1,126 @@ +should match snapshot when CRD mode is enabled with defaults: + 1: | + apiVersion: nvidia.com/v1alpha1 + kind: NVIDIADriver + metadata: + name: default + spec: + driverType: gpu + gdrcopy: + enabled: false + image: 
gdrdrv + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v2.5.2 + image: driver + kernelModuleType: auto + manager: + env: [] + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + rdma: + enabled: false + useHostMofed: false + repository: nvcr.io/nvidia + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 60 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + usePrecompiled: false + version: 595.58.03 +should match snapshot when gds and gdrcopy are enabled: + 1: | + apiVersion: nvidia.com/v1alpha1 + kind: NVIDIADriver + metadata: + name: default + spec: + driverType: gpu + gdrcopy: + enabled: true + image: gdrdrv + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v2.5.2 + gds: + enabled: true + image: nvidia-fs + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: 2.27.3 + image: driver + kernelModuleType: auto + manager: + env: [] + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + rdma: + enabled: false + useHostMofed: false + repository: nvcr.io/nvidia + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 60 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + usePrecompiled: false + version: 595.58.03 +should match snapshot with optional repoConfig, certConfig, kernelModuleConfig and licensing: + 1: | + apiVersion: nvidia.com/v1alpha1 + kind: NVIDIADriver + metadata: + name: default + spec: + certConfig: + name: my-cert + driverType: gpu + gdrcopy: + enabled: false + image: gdrdrv + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v2.5.2 + image: driver + kernelModuleConfig: + name: my-kmod + kernelModuleType: auto + licensingConfig: + 
nlsEnabled: true + secretName: my-secret + manager: + env: [] + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.10.0 + rdma: + enabled: false + useHostMofed: false + repoConfig: + name: my-repo + repository: nvcr.io/nvidia + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 60 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + usePrecompiled: false + version: 595.58.03 diff --git a/deployments/gpu-operator/tests/__snapshot__/operator_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/operator_test.yaml.snap new file mode 100644 index 000000000..a82b6a257 --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/operator_test.yaml.snap @@ -0,0 +1,93 @@ +should match snapshot: + 1: | + apiVersion: apps/v1 + kind: Deployment + metadata: + labels: + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: v25.0.0 + helm.sh/chart: gpu-operator-v1.0.0-devel + nvidia.com/gpu-driver-upgrade-drain.skip: "true" + name: gpu-operator + namespace: gpu-operator + spec: + replicas: 1 + selector: + matchLabels: + app: gpu-operator + app.kubernetes.io/component: gpu-operator + template: + metadata: + annotations: + openshift.io/scc: restricted-readonly + labels: + app: gpu-operator + app.kubernetes.io/component: gpu-operator + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: v25.0.0 + helm.sh/chart: gpu-operator-v1.0.0-devel + nvidia.com/gpu-driver-upgrade-drain.skip: "true" + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: In + values: + - "" + weight: 1 + 
containers: + - args: + - --leader-elect + - --zap-time-encoding=epoch + - --zap-log-level=info + command: + - gpu-operator + env: + - name: WATCH_NAMESPACE + value: "" + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: DRIVER_MANAGER_IMAGE + value: nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.10.0 + image: nvcr.io/nvidia/gpu-operator:v25.0.0 + imagePullPolicy: IfNotPresent + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: gpu-operator + ports: + - containerPort: 8080 + name: metrics + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + priorityClassName: system-node-critical + serviceAccountName: gpu-operator + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Equal + value: "" diff --git a/deployments/gpu-operator/tests/__snapshot__/plugin_config_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/plugin_config_test.yaml.snap new file mode 100644 index 000000000..579c7f96e --- /dev/null +++ b/deployments/gpu-operator/tests/__snapshot__/plugin_config_test.yaml.snap @@ -0,0 +1,22 @@ +should match snapshot when create=true and data is provided: + 1: | + apiVersion: v1 + data: + default: |- + version: v1 + flags: + migStrategy: none + mig-single: |- + version: v1 + flags: + migStrategy: single + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: gpu-operator + app.kubernetes.io/version: main-latest + helm.sh/chart: gpu-operator-v1.0.0-devel + name: device-plugin-config + namespace: gpu-operator diff --git a/deployments/gpu-operator/tests/__snapshot__/readonlyfs_scc_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/readonlyfs_scc_test.yaml.snap new file mode 100644 
index 000000000..cecaf597a
--- /dev/null
+++ b/deployments/gpu-operator/tests/__snapshot__/readonlyfs_scc_test.yaml.snap
@@ -0,0 +1,50 @@
+should match snapshot on openshift:
+  1: |
+    allowHostDirVolumePlugin: false
+    allowHostIPC: false
+    allowHostNetwork: false
+    allowHostPID: false
+    allowHostPorts: false
+    allowPrivilegeEscalation: true
+    allowPrivilegedContainer: false
+    allowedCapabilities: []
+    apiVersion: security.openshift.io/v1
+    defaultAddCapabilities: []
+    fsGroup:
+      type: MustRunAs
+    groups:
+      - system:authenticated
+    kind: SecurityContextConstraints
+    metadata:
+      annotations:
+        kubernetes.io/description: restricted denies access to all host features and requires pods to be run with a UID, read-only root filesystem and SELinux context that are allocated to the namespace. This SCC is more restrictive than the default restrictive SCC and it is used by default for authenticated users and operators and operands.
+      labels:
+        app.kubernetes.io/component: gpu-operator
+        app.kubernetes.io/instance: gpu-operator
+        app.kubernetes.io/managed-by: Helm
+        app.kubernetes.io/name: gpu-operator
+        app.kubernetes.io/version: main-latest
+        helm.sh/chart: gpu-operator-v1.0.0-devel
+      name: restricted-readonly
+    priority: 0
+    readOnlyRootFilesystem: true
+    requiredDropCapabilities:
+      - KILL
+      - MKNOD
+      - SETUID
+      - SETGID
+    runAsUser:
+      type: MustRunAsRange
+    seLinuxContext:
+      type: MustRunAs
+    supplementalGroups:
+      type: RunAsAny
+    users:
+      - system:serviceaccount:gpu-operator:gpu-operator
+    volumes:
+      - configMap
+      - downwardAPI
+      - emptyDir
+      - persistentVolumeClaim
+      - projected
+      - secret
diff --git a/deployments/gpu-operator/tests/__snapshot__/role_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/role_test.yaml.snap
new file mode 100644
index 000000000..2c844dea3
--- /dev/null
+++ b/deployments/gpu-operator/tests/__snapshot__/role_test.yaml.snap
@@ -0,0 +1,101 @@
+should match snapshot:
+  1: |
+    apiVersion: rbac.authorization.k8s.io/v1
+    kind: Role
+    metadata:
+      labels:
+        app.kubernetes.io/component: gpu-operator
+        app.kubernetes.io/instance: gpu-operator
+        app.kubernetes.io/managed-by: Helm
+        app.kubernetes.io/name: gpu-operator
+        app.kubernetes.io/version: main-latest
+        helm.sh/chart: gpu-operator-v1.0.0-devel
+      name: gpu-operator
+      namespace: gpu-operator
+    rules:
+      - apiGroups:
+          - rbac.authorization.k8s.io
+        resources:
+          - roles
+          - rolebindings
+        verbs:
+          - create
+          - get
+          - list
+          - watch
+          - update
+          - patch
+          - delete
+      - apiGroups:
+          - apps
+        resources:
+          - controllerrevisions
+        verbs:
+          - get
+          - list
+          - watch
+      - apiGroups:
+          - apps
+        resources:
+          - daemonsets
+        verbs:
+          - create
+          - get
+          - list
+          - watch
+          - update
+          - patch
+          - delete
+      - apiGroups:
+          - ""
+        resources:
+          - configmaps
+          - endpoints
+          - pods
+          - pods/eviction
+          - secrets
+          - services
+          - services/finalizers
+          - serviceaccounts
+        verbs:
+          - create
+          - get
+          - list
+          - watch
+          - update
+          - patch
+          - delete
+      - apiGroups:
+          - coordination.k8s.io
+        resources:
+          - leases
+        verbs:
+          - get
+          - list
+          - watch
+          - create
+          - update
+          - patch
+          - delete
+      - apiGroups:
+          - monitoring.coreos.com
+        resources:
+          - servicemonitors
+          - prometheusrules
+        verbs:
+          - get
+          - list
+          - create
+          - watch
+          - update
+          - delete
+      - apiGroups:
+          - nfd.k8s-sigs.io
+        resources:
+          - nodefeatures
+        verbs:
+          - get
+          - list
+          - watch
+          - create
+          - update
diff --git a/deployments/gpu-operator/tests/__snapshot__/rolebinding_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/rolebinding_test.yaml.snap
new file mode 100644
index 000000000..e86cfb34d
--- /dev/null
+++ b/deployments/gpu-operator/tests/__snapshot__/rolebinding_test.yaml.snap
@@ -0,0 +1,22 @@
+should match snapshot:
+  1: |
+    apiVersion: rbac.authorization.k8s.io/v1
+    kind: RoleBinding
+    metadata:
+      labels:
+        app.kubernetes.io/component: gpu-operator
+        app.kubernetes.io/instance: gpu-operator
+        app.kubernetes.io/managed-by: Helm
+        app.kubernetes.io/name: gpu-operator
+        app.kubernetes.io/version: main-latest
+        helm.sh/chart: gpu-operator-v1.0.0-devel
+      name: gpu-operator
+      namespace: gpu-operator
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: Role
+      name: gpu-operator
+    subjects:
+      - kind: ServiceAccount
+        name: gpu-operator
+        namespace: gpu-operator
diff --git a/deployments/gpu-operator/tests/__snapshot__/serviceaccount_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/serviceaccount_test.yaml.snap
new file mode 100644
index 000000000..ab3498f2b
--- /dev/null
+++ b/deployments/gpu-operator/tests/__snapshot__/serviceaccount_test.yaml.snap
@@ -0,0 +1,14 @@
+should match snapshot:
+  1: |
+    apiVersion: v1
+    kind: ServiceAccount
+    metadata:
+      labels:
+        app.kubernetes.io/component: gpu-operator
+        app.kubernetes.io/instance: gpu-operator
+        app.kubernetes.io/managed-by: Helm
+        app.kubernetes.io/name: gpu-operator
+        app.kubernetes.io/version: main-latest
+        helm.sh/chart: gpu-operator-v1.0.0-devel
+      name: gpu-operator
+      namespace: gpu-operator
diff --git a/deployments/gpu-operator/tests/__snapshot__/upgrade_crd_test.yaml.snap b/deployments/gpu-operator/tests/__snapshot__/upgrade_crd_test.yaml.snap
new file mode 100644
index 000000000..7a2c05884
--- /dev/null
+++ b/deployments/gpu-operator/tests/__snapshot__/upgrade_crd_test.yaml.snap
@@ -0,0 +1,96 @@
+should match snapshot:
+  1: |
+    apiVersion: v1
+    kind: ServiceAccount
+    metadata:
+      annotations:
+        helm.sh/hook: pre-upgrade
+        helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
+        helm.sh/hook-weight: "0"
+      name: gpu-operator-upgrade-crd-hook-sa
+      namespace: gpu-operator
+  2: |
+    apiVersion: rbac.authorization.k8s.io/v1
+    kind: ClusterRole
+    metadata:
+      annotations:
+        helm.sh/hook: pre-upgrade
+        helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
+        helm.sh/hook-weight: "0"
+      name: gpu-operator-upgrade-crd-hook-role
+    rules:
+      - apiGroups:
+          - apiextensions.k8s.io
+        resources:
+          - customresourcedefinitions
+        verbs:
+          - create
+          - get
+          - list
+          - watch
+          - patch
+          - update
+  3: |
+    apiVersion: rbac.authorization.k8s.io/v1
+    kind: ClusterRoleBinding
+    metadata:
+      annotations:
+        helm.sh/hook: pre-upgrade
+        helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
+        helm.sh/hook-weight: "0"
+      name: gpu-operator-upgrade-crd-hook-binding
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: ClusterRole
+      name: gpu-operator-upgrade-crd-hook-role
+    subjects:
+      - kind: ServiceAccount
+        name: gpu-operator-upgrade-crd-hook-sa
+        namespace: gpu-operator
+  4: |
+    apiVersion: batch/v1
+    kind: Job
+    metadata:
+      annotations:
+        helm.sh/hook: pre-upgrade
+        helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
+        helm.sh/hook-weight: "1"
+      labels:
+        app.kubernetes.io/component: gpu-operator
+        app.kubernetes.io/instance: gpu-operator
+        app.kubernetes.io/managed-by: Helm
+        app.kubernetes.io/name: gpu-operator
+        app.kubernetes.io/version: v25.0.0
+        helm.sh/chart: gpu-operator-v1.0.0-devel
+      name: gpu-operator-upgrade-crd
+      namespace: gpu-operator
+    spec:
+      template:
+        metadata:
+          labels:
+            app.kubernetes.io/component: gpu-operator
+            app.kubernetes.io/instance: gpu-operator
+            app.kubernetes.io/managed-by: Helm
+            app.kubernetes.io/name: gpu-operator
+            app.kubernetes.io/version: v25.0.0
+            helm.sh/chart: gpu-operator-v1.0.0-devel
+          name: gpu-operator-upgrade-crd
+        spec:
+          containers:
+            - args:
+                - apply
+                - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml
+                - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml
+                - --filepath=/opt/gpu-operator/nfd-api-crds.yaml
+              command:
+                - /usr/bin/manage-crds
+              image: nvcr.io/nvidia/gpu-operator:v25.0.0
+              imagePullPolicy: IfNotPresent
+              name: upgrade-crd
+          restartPolicy: OnFailure
+          serviceAccountName: gpu-operator-upgrade-crd-hook-sa
+          tolerations:
+            - effect: NoSchedule
+              key: node-role.kubernetes.io/control-plane
+              operator: Equal
+              value: ""
diff --git a/deployments/gpu-operator/tests/cleanup_crd_test.yaml b/deployments/gpu-operator/tests/cleanup_crd_test.yaml
new file mode 100644
index 000000000..5e3d5bd31
--- /dev/null
+++ b/deployments/gpu-operator/tests/cleanup_crd_test.yaml
@@ -0,0 +1,43 @@
+suite: cleanup_crd job
+templates:
+  - cleanup_crd.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+chart:
+  appVersion: "v25.0.0"  # pinned so the image tag and version label in the snapshot stay stable
+tests:
+  - it: does not render the cleanup Job by default (cleanupCRD disabled)
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot
+    set:
+      operator:
+        cleanupCRD: true
+    asserts:
+      - matchSnapshot: {}
+
+  - it: omits the nfd CRD filepath argument when nfd is disabled
+    set:
+      operator:
+        cleanupCRD: true
+      nfd:
+        enabled: false
+    asserts:
+      - notContains:
+          path: spec.template.spec.containers[0].args
+          content: --filepath=/opt/gpu-operator/nfd-api-crds.yaml
+
+  - it: applies image pull secrets to the cleanup Job
+    set:
+      operator:
+        cleanupCRD: true
+        imagePullSecrets:
+          - my-pull-secret
+    asserts:
+      - contains:
+          path: spec.template.spec.imagePullSecrets
+          content:
+            name: my-pull-secret
diff --git a/deployments/gpu-operator/tests/clusterpolicy_test.yaml b/deployments/gpu-operator/tests/clusterpolicy_test.yaml
new file mode 100644
index 000000000..743cc8e78
--- /dev/null
+++ b/deployments/gpu-operator/tests/clusterpolicy_test.yaml
@@ -0,0 +1,142 @@
+suite: clusterpolicy
+templates:
+  - clusterpolicy.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+chart:
+  appVersion: "v25.0.0"  # pinned; the nodeStatusExporter version assert below expects this value
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
+
+  - it: should match snapshot when cleanupCRD is enabled (resource-policy=keep)
+    set:
+      operator:
+        cleanupCRD: true
+    asserts:
+      - matchSnapshot: {}
+
+  - it: includes kubeletRootDir when configured
+    set:
+      hostPaths:
+        kubeletRootDir: /var/lib/kubelet
+    asserts:
+      - equal:
+          path: spec.hostPaths.kubeletRootDir
+          value: /var/lib/kubelet
+
+  - it: propagates the nri plugin flag when CDI is also enabled
+    set:
+      cdi:
+        enabled: true
+        nriPluginEnabled: true
+    asserts:
+      - equal:
+          path: spec.cdi.nriPluginEnabled
+          value: true
+
+  - it: prefers an explicit validator version when provided
+    set:
+      validator:
+        version: v9.9.9
+    asserts:
+      - equal:
+          path: spec.validator.version
+          value: v9.9.9
+
+  - it: surfaces device plugin config when a name is provided
+    set:
+      devicePlugin:
+        config:
+          name: my-config
+          default: default
+    asserts:
+      - equal:
+          path: spec.devicePlugin.config.name
+          value: my-config
+      - equal:
+          path: spec.devicePlugin.config.default
+          value: default
+
+  - it: surfaces dcgmExporter config name when provided
+    set:
+      dcgmExporter:
+        config:
+          name: custom-metrics
+    asserts:
+      - equal:
+          path: spec.dcgmExporter.config.name
+          value: custom-metrics
+
+  - it: surfaces migManager config.name when provided
+    set:
+      migManager:
+        config:
+          name: custom-mig-parted
+    asserts:
+      - equal:
+          path: spec.migManager.config.name
+          value: custom-mig-parted
+
+  - it: enables gds when toggled on
+    set:
+      gds:
+        enabled: true
+    asserts:
+      - equal:
+          path: spec.gds.enabled
+          value: true
+      - equal:
+          path: spec.gds.image
+          value: nvidia-fs
+
+  - it: enables nodeStatusExporter when toggled on
+    set:
+      nodeStatusExporter:
+        enabled: true
+    asserts:
+      - equal:
+          path: spec.nodeStatusExporter.enabled
+          value: true
+      - equal:
+          path: spec.nodeStatusExporter.version
+          value: v25.0.0
+
+  - it: surfaces operator initContainer when configured
+    set:
+      operator:
+        initContainer:
+          repository: my-registry
+          image: my-init
+          version: v1.2.3
+          imagePullPolicy: Always
+    asserts:
+      - equal:
+          path: spec.operator.initContainer.repository
+          value: my-registry
+      - equal:
+          path: spec.operator.initContainer.image
+          value: my-init
+      - equal:
+          path: spec.operator.initContainer.version
+          value: v1.2.3
+      - equal:
+          path: spec.operator.initContainer.imagePullPolicy
+          value: Always
+
+  - it: surfaces serviceMonitor settings on the operator section
+    set:
+      operator:
+        metrics:
+          serviceMonitor:
+            interval: 30s
+            honorLabels: true
+    asserts:
+      - equal:
+          path: spec.operator.metrics.serviceMonitor.interval
+          value: 30s
+      - equal:
+          path: spec.operator.metrics.serviceMonitor.honorLabels
+          value: true
diff --git a/deployments/gpu-operator/tests/clusterrole_test.yaml b/deployments/gpu-operator/tests/clusterrole_test.yaml
new file mode 100644
index 000000000..1af999a8a
--- /dev/null
+++ b/deployments/gpu-operator/tests/clusterrole_test.yaml
@@ -0,0 +1,39 @@
+suite: clusterrole
+templates:
+  - clusterrole.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
+
+  - it: should match snapshot when cleanupCRD is enabled
+    set:
+      operator:
+        cleanupCRD: true
+    asserts:
+      - matchSnapshot: {}
+
+
+  - it: grants delete on customresourcedefinitions when cleanupCRD is enabled
+    set:
+      operator:
+        cleanupCRD: true
+    asserts:
+      - contains:
+          path: rules
+          content:
+            apiGroups:
+              - apiextensions.k8s.io
+            resources:
+              - customresourcedefinitions
+            verbs:
+              - get
+              - list
+              - watch
+              - update
+              - patch
+              - create
+              - delete
diff --git a/deployments/gpu-operator/tests/clusterrolebinding_test.yaml b/deployments/gpu-operator/tests/clusterrolebinding_test.yaml
new file mode 100644
index 000000000..5884d50f2
--- /dev/null
+++ b/deployments/gpu-operator/tests/clusterrolebinding_test.yaml
@@ -0,0 +1,19 @@
+suite: clusterrolebinding
+templates:
+  - clusterrolebinding.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
+
+  - it: places the subject ServiceAccount in the release namespace
+    release:
+      name: gpu-operator
+      namespace: another-ns
+    asserts:
+      - equal:
+          path: subjects[0].namespace
+          value: another-ns
diff --git a/deployments/gpu-operator/tests/dcgm_exporter_config_test.yaml b/deployments/gpu-operator/tests/dcgm_exporter_config_test.yaml
new file mode 100644
index 000000000..e215e1517
--- /dev/null
+++ b/deployments/gpu-operator/tests/dcgm_exporter_config_test.yaml
@@ -0,0 +1,32 @@
+suite: dcgm-exporter custom config
+templates:
+  - dcgm_exporter_config.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: does not render the ConfigMap by default
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: does not render the ConfigMap when create=true but data is empty
+    set:
+      dcgmExporter:
+        config:
+          create: true
+          name: custom-metrics
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot when create=true and data is provided
+    set:
+      dcgmExporter:
+        config:
+          create: true
+          name: custom-metrics
+          data: |-
+            DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
+    asserts:
+      - matchSnapshot: {}
diff --git a/deployments/gpu-operator/tests/extra_objects_test.yaml b/deployments/gpu-operator/tests/extra_objects_test.yaml
new file mode 100644
index 000000000..2786ea2cf
--- /dev/null
+++ b/deployments/gpu-operator/tests/extra_objects_test.yaml
@@ -0,0 +1,54 @@
+suite: extraObjects
+templates:
+  - extra-objects.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: renders no extra objects by default
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot for an extra object provided as a structured map
+    set:
+      extraObjects:
+        - apiVersion: v1
+          kind: ConfigMap
+          metadata:
+            name: extra-cm
+            namespace: "{{ .Release.Namespace }}"  # quoted so YAML does not read the braces as a flow mapping
+          data:
+            foo: bar
+    asserts:
+      - matchSnapshot: {}
+
+  - it: should match snapshot for an extra object provided as a templated string
+    set:
+      extraObjects:
+        - |
+          apiVersion: v1
+          kind: Secret
+          metadata:
+            name: extra-secret
+            namespace: {{ .Release.Namespace }}
+          type: Opaque
+          stringData:
+            release: {{ .Release.Name }}
+    asserts:
+      - matchSnapshot: {}
+
+  - it: renders multiple extra objects
+    set:
+      extraObjects:
+        - apiVersion: v1
+          kind: ConfigMap
+          metadata:
+            name: extra-cm-1
+        - apiVersion: v1
+          kind: ConfigMap
+          metadata:
+            name: extra-cm-2
+    asserts:
+      - hasDocuments:
+          count: 2
diff --git a/deployments/gpu-operator/tests/mig_config_test.yaml b/deployments/gpu-operator/tests/mig_config_test.yaml
new file mode 100644
index 000000000..d70f166d0
--- /dev/null
+++ b/deployments/gpu-operator/tests/mig_config_test.yaml
@@ -0,0 +1,37 @@
+suite: mig-manager custom config
+templates:
+  - mig_config.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: does not render the ConfigMap by default
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: does not render the ConfigMap when create=true but data is empty
+    set:
+      migManager:
+        config:
+          create: true
+          name: custom-mig-parted
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot when create=true and data is provided
+    set:
+      migManager:
+        config:
+          create: true
+          name: custom-mig-parted
+          data:
+            config.yaml: |-
+              version: v1
+              mig-configs:
+                all-disabled:
+                  - devices: all
+                    mig-enabled: false
+    asserts:
+      - matchSnapshot: {}
diff --git a/deployments/gpu-operator/tests/nodefeaturerules_test.yaml b/deployments/gpu-operator/tests/nodefeaturerules_test.yaml
new file mode 100644
index 000000000..04f3ed026
--- /dev/null
+++ b/deployments/gpu-operator/tests/nodefeaturerules_test.yaml
@@ -0,0 +1,18 @@
+suite: nodefeaturerules
+templates:
+  - nodefeaturerules.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: does not render NodeFeatureRule by default
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot when nodefeaturerules is enabled
+    set:
+      nfd:
+        nodefeaturerules: true
+    asserts:
+      - matchSnapshot: {}
diff --git a/deployments/gpu-operator/tests/nvidiadriver_test.yaml b/deployments/gpu-operator/tests/nvidiadriver_test.yaml
new file mode 100644
index 000000000..b9f33cf26
--- /dev/null
+++ b/deployments/gpu-operator/tests/nvidiadriver_test.yaml
@@ -0,0 +1,83 @@
+suite: nvidiadriver default CR
+templates:
+  - nvidiadriver.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: does not render the NVIDIADriver CR by default (nvidiaDriverCRD disabled)
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: does not render when driver.enabled is false even if nvidiaDriverCRD is enabled
+    set:
+      driver:
+        enabled: false
+        nvidiaDriverCRD:
+          enabled: true
+          deployDefaultCR: true
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: does not render when nvidiaDriverCRD.deployDefaultCR is false
+    set:
+      driver:
+        nvidiaDriverCRD:
+          enabled: true
+          deployDefaultCR: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot when CRD mode is enabled with defaults
+    set:
+      driver:
+        nvidiaDriverCRD:
+          enabled: true
+          deployDefaultCR: true
+    asserts:
+      - matchSnapshot: {}
+
+  - it: applies driverType override
+    set:
+      driver:
+        nvidiaDriverCRD:
+          enabled: true
+          deployDefaultCR: true
+          driverType: vgpu
+    asserts:
+      - equal:
+          path: spec.driverType
+          value: vgpu
+
+  - it: should match snapshot with optional repoConfig, certConfig, kernelModuleConfig and licensing
+    set:
+      driver:
+        nvidiaDriverCRD:
+          enabled: true
+          deployDefaultCR: true
+        repoConfig:
+          configMapName: my-repo
+        certConfig:
+          name: my-cert
+        kernelModuleConfig:
+          name: my-kmod
+        licensingConfig:
+          secretName: my-secret
+    asserts:
+      - matchSnapshot: {}
+
+  - it: should match snapshot when gds and gdrcopy are enabled
+    set:
+      driver:
+        nvidiaDriverCRD:
+          enabled: true
+          deployDefaultCR: true
+      gds:
+        enabled: true
+      gdrcopy:
+        enabled: true
+    asserts:
+      - matchSnapshot: {}
diff --git a/deployments/gpu-operator/tests/operator_test.yaml b/deployments/gpu-operator/tests/operator_test.yaml
new file mode 100644
index 000000000..3ff14b689
--- /dev/null
+++ b/deployments/gpu-operator/tests/operator_test.yaml
@@ -0,0 +1,61 @@
+suite: operator deployment
+templates:
+  - operator.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+chart:
+  appVersion: "v25.0.0"  # pinned so the default-version snapshot does not follow Chart.yaml
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
+
+  - it: prefers operator.version over Chart.AppVersion when set
+    set:
+      operator:
+        version: v25.99.0
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].image
+          value: nvcr.io/nvidia/gpu-operator:v25.99.0
+
+  - it: switches to --zap-devel when develMode is enabled
+    set:
+      operator:
+        logging:
+          develMode: true
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].args
+          content: --zap-devel
+      - notContains:
+          path: spec.template.spec.containers[0].args
+          content: --zap-time-encoding=epoch
+      - notContains:
+          path: spec.template.spec.containers[0].args
+          content: --zap-log-level=info
+
+  - it: omits priorityClassName when explicitly cleared
+    set:
+      operator:
+        priorityClassName: ""
+    asserts:
+      - notExists:
+          path: spec.template.spec.priorityClassName
+
+  - it: renders imagePullSecrets when configured
+    set:
+      operator:
+        imagePullSecrets:
+          - my-pull-secret
+          - other-secret
+    asserts:
+      - contains:
+          path: spec.template.spec.imagePullSecrets
+          content:
+            name: my-pull-secret
+      - contains:
+          path: spec.template.spec.imagePullSecrets
+          content:
+            name: other-secret
diff --git a/deployments/gpu-operator/tests/plugin_config_test.yaml b/deployments/gpu-operator/tests/plugin_config_test.yaml
new file mode 100644
index 000000000..8aed30107
--- /dev/null
+++ b/deployments/gpu-operator/tests/plugin_config_test.yaml
@@ -0,0 +1,40 @@
+suite: device-plugin custom config
+templates:
+  - plugin_config.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: does not render the ConfigMap by default
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: does not render the ConfigMap when create=true but data is empty
+    set:
+      devicePlugin:
+        config:
+          create: true
+          name: device-plugin-config
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot when create=true and data is provided
+    set:
+      devicePlugin:
+        config:
+          create: true
+          name: device-plugin-config
+          default: default
+          data:
+            default: |-
+              version: v1
+              flags:
+                migStrategy: none
+            mig-single: |-
+              version: v1
+              flags:
+                migStrategy: single
+    asserts:
+      - matchSnapshot: {}
diff --git a/deployments/gpu-operator/tests/readonlyfs_scc_test.yaml b/deployments/gpu-operator/tests/readonlyfs_scc_test.yaml
new file mode 100644
index 000000000..7a1caf93f
--- /dev/null
+++ b/deployments/gpu-operator/tests/readonlyfs_scc_test.yaml
@@ -0,0 +1,30 @@
+suite: openshift readonly-fs SCC
+templates:
+  - readonlyfs_scc.openshift.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: does not render on non-openshift clusters
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should match snapshot on openshift
+    set:
+      platform:
+        openshift: true
+    asserts:
+      - matchSnapshot: {}
+
+  - it: scopes the SCC user to the release namespace
+    release:
+      name: gpu-operator
+      namespace: openshift-nvidia
+    set:
+      platform:
+        openshift: true
+    asserts:
+      - contains:
+          path: users
+          content: system:serviceaccount:openshift-nvidia:gpu-operator
diff --git a/deployments/gpu-operator/tests/role_test.yaml b/deployments/gpu-operator/tests/role_test.yaml
new file mode 100644
index 000000000..3609f75e7
--- /dev/null
+++ b/deployments/gpu-operator/tests/role_test.yaml
@@ -0,0 +1,10 @@
+suite: role
+templates:
+  - role.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
diff --git a/deployments/gpu-operator/tests/rolebinding_test.yaml b/deployments/gpu-operator/tests/rolebinding_test.yaml
new file mode 100644
index 000000000..21fc97ebf
--- /dev/null
+++ b/deployments/gpu-operator/tests/rolebinding_test.yaml
@@ -0,0 +1,10 @@
+suite: rolebinding
+templates:
+  - rolebinding.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
diff --git a/deployments/gpu-operator/tests/serviceaccount_test.yaml b/deployments/gpu-operator/tests/serviceaccount_test.yaml
new file mode 100644
index 000000000..a6a783137
--- /dev/null
+++ b/deployments/gpu-operator/tests/serviceaccount_test.yaml
@@ -0,0 +1,19 @@
+suite: serviceaccount
+templates:
+  - serviceaccount.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
+
+  - it: respects the release namespace override
+    release:
+      name: gpu-operator
+      namespace: custom-ns
+    asserts:
+      - equal:
+          path: metadata.namespace
+          value: custom-ns
diff --git a/deployments/gpu-operator/tests/upgrade_crd_test.yaml b/deployments/gpu-operator/tests/upgrade_crd_test.yaml
new file mode 100644
index 000000000..10f9e788f
--- /dev/null
+++ b/deployments/gpu-operator/tests/upgrade_crd_test.yaml
@@ -0,0 +1,32 @@
+suite: upgrade_crd hook
+templates:
+  - upgrade_crd.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+chart:
+  appVersion: "v25.0.0"  # pinned so the image tag and version label in the snapshot stay stable
+tests:
+  - it: should match snapshot
+    asserts:
+      - matchSnapshot: {}
+
+  - it: renders no documents when upgradeCRD is disabled
+    set:
+      operator:
+        upgradeCRD: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: omits the nfd CRD filepath when nfd is disabled
+    set:
+      nfd:
+        enabled: false
+    documentSelector:  # the template renders four documents; only the Job has container args
+      path: kind
+      value: Job
+    asserts:
+      - notContains:
+          path: spec.template.spec.containers[0].args
+          content: --filepath=/opt/gpu-operator/nfd-api-crds.yaml
diff --git a/deployments/gpu-operator/tests/validations_test.yaml b/deployments/gpu-operator/tests/validations_test.yaml
new file mode 100644
index 000000000..b0e411655
--- /dev/null
+++ b/deployments/gpu-operator/tests/validations_test.yaml
@@ -0,0 +1,42 @@
+suite: input validations
+templates:
+  - validations.yaml
+release:
+  name: gpu-operator
+  namespace: gpu-operator
+tests:
+  - it: renders nothing with the default values
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: fails when nri plugin is enabled but CDI is disabled
+    set:
+      cdi:
+        enabled: false
+        nriPluginEnabled: true
+    asserts:
+      - failedTemplate:
+          errorMessage: "the NRI Plugin cannot be enabled when CDI is disabled"
+
+  - it: fails when nri plugin is enabled but the container toolkit is disabled
+    set:
+      cdi:
+        enabled: true
+        nriPluginEnabled: true
+      toolkit:
+        enabled: false
+    asserts:
+      - failedTemplate:
+          errorMessage: "the NRI Plugin cannot be enabled when the Container Toolkit is disabled"
+
+  - it: succeeds when nri plugin is enabled with CDI and toolkit both enabled
+    set:
+      cdi:
+        enabled: true
+        nriPluginEnabled: true
+      toolkit:
+        enabled: true
+    asserts:
+      - hasDocuments:
+          count: 0
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 9dba0c80c..c44eb9758 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -59,7 +59,7 @@ validator:
   repository: nvcr.io/nvidia
   image: gpu-operator
   # If version is not specified, then default is to use chart.AppVersion
-  #version: ""
+  # version: ""
   imagePullPolicy: IfNotPresent
   imagePullSecrets: []
   env: []
@@ -73,7 +73,7 @@ operator:
   repository: nvcr.io/nvidia
   image: gpu-operator
   # If version is not specified, then default is to use chart.AppVersion
-  #version: ""
+  # version: ""
   imagePullPolicy: IfNotPresent
   imagePullSecrets: []
   priorityClassName: system-node-critical
@@ -423,7 +423,7 @@ nodeStatusExporter:
   repository: nvcr.io/nvidia
   image: gpu-operator
   # If version is not specified, then default is to use chart.AppVersion
-  #version: ""
+  # version: ""
   imagePullPolicy: IfNotPresent
   imagePullSecrets: []
   resources: {}