diff --git a/demo/clusters/kind/create-cluster.sh b/demo/clusters/kind/create-cluster.sh
new file mode 100755
index 000000000..6eb82a3f6
--- /dev/null
+++ b/demo/clusters/kind/create-cluster.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Builds the kind node image and creates the demo kind cluster.
+# SCRIPTS_DIR and KIND_CLUSTER_NAME are provided by scripts/common.sh.
+
+# A reference to the current directory where this script is located
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+
+set -ex
+set -o pipefail
+
+source "${CURRENT_DIR}/scripts/common.sh"
+
+# Build the kind image and create a test cluster.
+# The paths are quoted so a checkout location containing spaces still works.
+"${SCRIPTS_DIR}/build-kind-image.sh"
+"${SCRIPTS_DIR}/create-kind-cluster.sh"
+
+# Tracing is switched off so the colored summary below stays readable.
+set +x
+printf '\033[0;32m'
+echo "Cluster creation complete: ${KIND_CLUSTER_NAME}"
+printf '\033[0m'
diff --git a/demo/clusters/kind/delete-cluster.sh b/demo/clusters/kind/delete-cluster.sh
new file mode 100755
index 000000000..066f5606b
--- /dev/null
+++ b/demo/clusters/kind/delete-cluster.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A reference to the current directory where this script is located
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+
+set -ex
+set -o pipefail
+
+source "${CURRENT_DIR}/scripts/common.sh"
+
+# Delete the test cluster (path quoted so directories with spaces work)
+"${SCRIPTS_DIR}/delete-kind-cluster.sh"
+
+set +x
+printf '\033[0;32m'
+echo "Cluster deletion complete: ${KIND_CLUSTER_NAME}"
+printf '\033[0m'
diff --git a/demo/clusters/kind/demo.sh b/demo/clusters/kind/demo.sh
new file mode 100755
index 000000000..047987f9a
--- /dev/null
+++ b/demo/clusters/kind/demo.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Demo driver: recreates the kind cluster and, unless "bare" is chosen,
+# installs the gpu-operator chart and schedules the pod from gpu-pod.yml.
+# Sibling files are addressed through CURRENT_DIR so any working directory works.
+
+# A reference to the current directory where this script is located
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+
+set -e
+set -o pipefail
+
+source "${CURRENT_DIR}/scripts/common.sh"
+
+# Adds the "nvidia" helm repository unless `helm repo list` already shows it.
+add_repo () {
+  local repo_count
+  # `helm repo list` fails when no repository is configured yet; awk still
+  # prints 0 then, so the failure is ignored rather than tripping `set -e`.
+  repo_count=$(helm repo list 2>/dev/null \
+    | awk 'NR > 1 && $1 == "nvidia" {count++} END {print count+0}' || true)
+  # Numeric test: `<` inside [[ ]] compares strings, not integers.
+  if (( repo_count < 1 )); then
+    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+    helm repo update
+  fi
+}
+
+# Deletes the cluster named KIND_CLUSTER_NAME if `kind get clusters` lists it.
+clear_old_cluster () {
+  local num
+  # grep -c exits 1 when it counts zero matches, hence the `|| true`.
+  num=$(kind get clusters | grep -Fxc "${KIND_CLUSTER_NAME}" || true)
+  if (( num == 1 )); then
+    "${CURRENT_DIR}/delete-cluster.sh"
+  elif (( num > 1 )); then
+    echo 'too many clusters debug'
+    kind get clusters
+    exit 1
+  else
+    echo 'no clusters to clear'
+  fi
+}
+
+# Replaces any previous demo cluster with a freshly created one.
+create_cluster () {
+  clear_old_cluster
+  add_repo
+  "${CURRENT_DIR}/create-cluster.sh"
+}
+
+# exec_local/exec_gdrcopy/exec_release create the cluster and then run
+# install-operator.sh with the option of the same name.
+exec_local () {
+  create_cluster
+  "${CURRENT_DIR}/install-operator.sh" local
+}
+
+exec_gdrcopy () {
+  create_cluster
+  "${CURRENT_DIR}/install-operator.sh" gdrcopy
+}
+
+exec_release () {
+  create_cluster
+  "${CURRENT_DIR}/install-operator.sh" release
+}
+
+# Creates the cluster only, then exits the script with status 0.
+exec_bare () {
+  create_cluster
+  echo 'As this is a bare-cluster we will end here instead of installing the operator and the gpu-pod'
+  exit 0
+}
+
+# Waits up to 180s for a daemonset rollout. $1: namespace, $2: daemonset name.
+wait_for_daemonset () {
+  local target_namespace=$1
+  local target_daemonset=$2
+  kubectl rollout status --timeout=180s -n "${target_namespace}" "daemonset/${target_daemonset}"
+}
+
+# Prints the accepted choices and exits with status 1.
+usage () {
+  echo './demo.sh [CHOICE]'
+  echo 'where [CHOICE] is one of "bare", "release", "local", or "gdrcopy"'
+  exit 1
+}
+
+# Runs the scenario selected by $1, then waits for the operator daemonsets
+# and launches the sample GPU pod ("bare" exits before reaching that part).
+demo () {
+  case "${1:-}" in
+    '') usage ;;
+    release) exec_release ;;
+    local) exec_local ;;
+    gdrcopy) exec_gdrcopy ;;
+    bare) exec_bare ;;
+    *)
+      echo 'unrecognized option'
+      usage
+      ;;
+  esac
+  wait_for_daemonset gpu-operator nvidia-container-toolkit-daemonset
+  wait_for_daemonset gpu-operator nvidia-device-plugin-daemonset
+  kubectl apply -f "${CURRENT_DIR}/gpu-pod.yml"
+  sleep 3
+  kubectl get pod gpu-pod
+}
+
+time demo "$@"
diff
--git a/demo/clusters/kind/gpu-pod.yml b/demo/clusters/kind/gpu-pod.yml
new file mode 100644
index 000000000..2c347b13b
--- /dev/null
+++ b/demo/clusters/kind/gpu-pod.yml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+spec:
+  restartPolicy: Never
+  containers:
+    - name: cuda-container
+      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
+      resources:
+        limits:
+          nvidia.com/gpu: 1 # requesting 1 GPU
+  tolerations:
+  - key: nvidia.com/gpu
+    operator: Exists
+    effect: NoSchedule
diff --git a/demo/clusters/kind/install-operator.sh b/demo/clusters/kind/install-operator.sh
new file mode 100755
index 000000000..35244f04e
--- /dev/null
+++ b/demo/clusters/kind/install-operator.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Installs or renders the gpu-operator helm chart for the demo cluster.
+# TARGET_CHART, TARGET_ACTION and XTRA_OPTS may be preset in the environment
+# to override the per-option defaults chosen below.
+
+# Prints the accepted options and exits with status 1.
+usage () {
+  echo 'USAGE:'
+  echo './install-operator.sh [option]'
+  echo 'where [option] is one of local, gdrcopy, release, template, template-release'
+  exit 1
+}
+
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+source "${CURRENT_DIR}/scripts/common.sh"
+
+# setting default
+# but these can be overridden by environment variables
+case "${1:-}" in
+  '')
+    usage
+    ;;
+  local)
+    : "${TARGET_CHART:=${PROJECT_DIR}/deployments/gpu-operator}"
+    : "${TARGET_ACTION:=upgrade -i}"
+    : "${XTRA_OPTS:=--wait}"
+    ;;
+  gdrcopy)
+    : "${TARGET_CHART:=${PROJECT_DIR}/deployments/gpu-operator}"
+    : "${TARGET_ACTION:=upgrade -i}"
+    : "${XTRA_OPTS:=--wait --set gdrcopy.enabled=true}"
+    ;;
+  release)
+    : "${TARGET_CHART:=nvidia/gpu-operator}"
+    : "${TARGET_ACTION:=upgrade -i}"
+    : "${XTRA_OPTS:=--wait}"
+    ;;
+  template)
+    : "${TARGET_CHART:=${PROJECT_DIR}/deployments/gpu-operator}"
+    : "${TARGET_ACTION:=template}"
+    : "${XTRA_OPTS:=--output-dir /tmp/gpu-operator}"
+    ;;
+  template-release)
+    : "${TARGET_CHART:=nvidia/gpu-operator}"
+    : "${TARGET_ACTION:=template}"
+    : "${XTRA_OPTS:=--output-dir /tmp/gpu-operator-release}"
+    ;;
+  *)
+    echo unknown usage "$0 $*"
+    usage
+    ;;
+esac
+
+set -ex
+set -o pipefail
+
+#kubectl label node "${KIND_CLUSTER_NAME}-worker" --overwrite nvidia.com/gpu.present=true
+
+# TARGET_ACTION and XTRA_OPTS hold several words and are left unquoted on
+# purpose so they split into separate helm arguments. The env[0] keys are
+# quoted so the shell never treats the brackets as a glob pattern.
+# shellcheck disable=SC2086
+helm ${TARGET_ACTION} \
+  --set cdi.enabled=true \
+  --set driver.enabled=false \
+  --set operator.runtimeClass=nvidia \
+  --set toolkit.enabled=true \
+  --set 'validator.driver.env[0].name=DISABLE_DEV_CHAR_SYMLINK_CREATION' \
+  --set-string 'validator.driver.env[0].value=true' \
+  --namespace gpu-operator --create-namespace \
+  ${XTRA_OPTS} \
+  nvidia-gpu-operator \
+  "${TARGET_CHART}"
+
+  #--set runtimeClassName=nvidia \
+
+set +x
+printf '\033[0;32m'
+echo "$TARGET_ACTION complete:"
+kubectl get pod -n gpu-operator
+printf '\033[0m'
diff --git a/demo/clusters/kind/scripts/build-kind-image.sh
b/demo/clusters/kind/scripts/build-kind-image.sh
new file mode 100755
index 000000000..704a4592d
--- /dev/null
+++ b/demo/clusters/kind/scripts/build-kind-image.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Builds the kind node image ${KIND_IMAGE} from the Kubernetes sources tagged
+# ${KIND_K8S_TAG}, unless docker already lists an image with that reference.
+
+# A reference to the current directory where this script is located
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+
+set -ex
+set -o pipefail
+
+source "${CURRENT_DIR}/common.sh"
+
+# If an image ID already exists for the image we plan to build, we are done.
+EXISTING_IMAGE_ID="$(docker images --filter "reference=${KIND_IMAGE}" -q)"
+if [[ -n "${EXISTING_IMAGE_ID}" ]]; then
+  exit 0
+fi
+
+# Create a temporary directory to hold all the artifacts we need for building the image
+TMP_DIR="$(mktemp -d)"
+cleanup() {
+  rm -rf "${TMP_DIR}"
+}
+trap cleanup EXIT
+
+# Set some build variables
+KIND_K8S_REPO="https://github.com/kubernetes/kubernetes.git"
+KIND_K8S_DIR="${TMP_DIR}/kubernetes-${KIND_K8S_TAG}"
+
+# Checkout the version of kubernetes we want to build our kind image from
+git clone --depth 1 --branch "${KIND_K8S_TAG}" "${KIND_K8S_REPO}" "${KIND_K8S_DIR}"
+
+# Build the kind node image on top of the configured base image
+kind build node-image --base-image "${KIND_IMAGE_BASE}" --image "${KIND_IMAGE}" "${KIND_K8S_DIR}"
diff --git a/demo/clusters/kind/scripts/common.sh b/demo/clusters/kind/scripts/common.sh
new file mode 100644
index 000000000..dc1e20381
--- /dev/null
+++ b/demo/clusters/kind/scripts/common.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Shared settings, sourced by the other demo scripts. Every variable set
+# with `: "${VAR:=default}"` keeps a value already present in the environment.
+
+# A reference to the current directory where this script is located
+SCRIPTS_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+# dirname drops the trailing "..", so this resolves four levels above scripts/.
+PROJECT_DIR="$(cd -- "$( dirname -- "${SCRIPTS_DIR}/../../../../.." )" &> /dev/null && pwd)"
+
+CLUSTER_NAME=gpu-operator-demo
+
+# The kubernetes tag to build the kind cluster from
+# From https://github.com/kubernetes/kubernetes/tags
+: "${KIND_K8S_TAG:=v1.27.1}"
+
+# The name of the kind cluster to create
+: "${KIND_CLUSTER_NAME:=${CLUSTER_NAME}-cluster}"
+
+# The worker node used for toolkit and GPU configuration
+: "${KIND_WORKER_NODE:=${KIND_CLUSTER_NAME}-worker}"
+
+# The path to kind's cluster configuration file
+: "${KIND_CLUSTER_CONFIG_PATH:=${SCRIPTS_DIR}/kind-cluster-config.yaml}"
+
+# The derived name of the kind image to build
+: "${KIND_IMAGE_BASE_TAG:=v20230515-01914134-containerd_v1.7.1}"
+: "${KIND_IMAGE_BASE:=gcr.io/k8s-staging-kind/base:${KIND_IMAGE_BASE_TAG}}"
+: "${KIND_IMAGE:=kindest/node:${KIND_K8S_TAG}-${KIND_IMAGE_BASE_TAG}}"
diff --git a/demo/clusters/kind/scripts/create-kind-cluster.sh b/demo/clusters/kind/scripts/create-kind-cluster.sh
new file mode 100755
index 000000000..ea1afc3d0
--- /dev/null
+++ b/demo/clusters/kind/scripts/create-kind-cluster.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A reference to the current directory where this script is located
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+
+set -exo pipefail
+
+source "${CURRENT_DIR}/common.sh"
+
+# Runs bash snippet $1 in the worker node; pipefail surfaces failed downloads.
+worker_exec() {
+  docker exec "${KIND_WORKER_NODE}" bash -euo pipefail -c "$1"
+}
+
+kind create cluster \
+  --retain \
+  --name "${KIND_CLUSTER_NAME}" \
+  --image "${KIND_IMAGE}" \
+  --config "${KIND_CLUSTER_CONFIG_PATH}"
+
+# Unmount the masked /proc/driver/nvidia to allow
+# dynamically generated MIG devices to be discovered
+worker_exec "umount -R /proc/driver/nvidia || true"
+
+# Install the nvidia-container-toolkit.
+# TODO: Once kind supports a more standard GPU runtime bootstrap, we can remove this.
+worker_exec "\
+  apt-get update \
+  && apt-get install -y --no-install-recommends ca-certificates curl gnupg2 \
+  && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
+    gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+  && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
+  && apt-get update \
+  && apt-get install -y nvidia-container-toolkit \
+  && ln -sf /sbin/ldconfig /sbin/ldconfig.real"
+
+# Configure containerd with an NVIDIA runtime handler for the operator's
+# RuntimeClass and enable CDI support for workload injection.
+worker_exec "\
+  nvidia-ctk config --in-place --set nvidia-container-runtime.modes.cdi.annotation-prefixes=nvidia.cdi.k8s.io/ \
+  && nvidia-ctk runtime configure --runtime=containerd --cdi.enabled \
+  && systemctl restart containerd"
diff --git a/demo/clusters/kind/scripts/delete-kind-cluster.sh b/demo/clusters/kind/scripts/delete-kind-cluster.sh
new file mode 100755
index 000000000..f62e21ecc
--- /dev/null
+++ b/demo/clusters/kind/scripts/delete-kind-cluster.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A reference to the current directory where this script is located
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+
+set -ex
+set -o pipefail
+
+source "${CURRENT_DIR}/common.sh"
+
+# Delete the kind cluster named by KIND_CLUSTER_NAME (defaulted in common.sh)
+kind delete cluster \
+  --name "${KIND_CLUSTER_NAME}"
diff --git a/demo/clusters/kind/scripts/kind-cluster-config.yaml b/demo/clusters/kind/scripts/kind-cluster-config.yaml
new file mode 100644
index 000000000..677a00cda
--- /dev/null
+++ b/demo/clusters/kind/scripts/kind-cluster-config.yaml
@@ -0,0 +1,48 @@
+# Copyright 2023 The Kubernetes Authors.
+# Copyright 2023 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  kubeadmConfigPatches:
+  - |
+    kind: ClusterConfiguration
+    scheduler:
+        extraArgs:
+          v: "1"
+    controllerManager:
+        extraArgs:
+          v: "1"
+  - |
+    kind: InitConfiguration
+    nodeRegistration:
+      kubeletExtraArgs:
+        v: "1"
+- role: worker
+  kubeadmConfigPatches:
+  - |
+    kind: JoinConfiguration
+    nodeRegistration:
+      kubeletExtraArgs:
+        v: "1"
+  extraMounts:
+  # We inject all NVIDIA GPUs using the nvidia-container-runtime.
+  # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
+  # in `/etc/nvidia-container-runtime/config.toml`
+  # NOTE(review): scripts/create-kind-cluster.sh does not visibly set this
+  # option -- confirm it is configured before relying on this mount.
+  - hostPath: /dev/null
+    containerPath: /var/run/nvidia-container-devices/all