From d04ad8a06dac2efd195ed47e4c686f98965532a1 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Fri, 12 Jun 2026 14:10:08 -0700 Subject: [PATCH 1/5] feat: prepare helm chart for kubernetes deployment options Signed-off-by: Brooke Storm --- .github/actions/changes/action.yaml | 9 + .github/workflows/ci.yaml | 117 +++ .gitignore | 16 +- .pre-commit-config.yaml | 6 +- AGENTS.md | 2 +- k8s/helm/.helmignore | 28 + k8s/helm/Chart.yaml | 18 + k8s/helm/README.md | 287 ++++++ .../ci/01-preexisting-imagepullsecret.yaml | 1 + k8s/helm/ci/02-empty-imagepullsecret.yaml | 2 + k8s/helm/ci/03-global-security-context.yaml | 15 + k8s/helm/ci/04-security-context-override.yaml | 30 + k8s/helm/ci/05-enable-auth.yaml | 5 + k8s/helm/ci/06-kyverno-aws.yaml | 6 + k8s/helm/ci/07-kyverno-azure.yaml | 7 + k8s/helm/ci/08-kyverno-gcp.yaml | 5 + k8s/helm/ci/09-kyverno-oci.yaml | 6 + k8s/helm/ci/10-override-image-registry.yaml | 11 + k8s/helm/ci/11-setting-env.yaml | 17 + .../ci/12-topology-spread-constraints.yaml | 22 + k8s/helm/ci/13-openshift.yaml | 31 + k8s/helm/ci/14-pdb-min-available.yaml | 9 + k8s/helm/ci/14-secretfromenv.yaml | 7 + k8s/helm/ci/15-pdb-max-unavailable.yaml | 9 + k8s/helm/ci/16-enable-ingress.yaml | 6 + k8s/helm/ci/17-ingress-default-host.yaml | 7 + k8s/helm/ci/18-enable-httproute.yaml | 9 + k8s/helm/ci/19-envoy-extra-args.yaml | 11 + k8s/helm/files/nccl-test/entrypoint.sh | 147 +++ k8s/helm/files/nccl-test/nccl-env.sh | 14 + k8s/helm/files/nccl-test/nccl_test.py | 109 ++ k8s/helm/files/nccl-test/orchestrator.py | 602 +++++++++++ .../nemo-helm-readme.md.gotmpl | 7 + k8s/helm/templates/NOTES.txt | 133 +++ k8s/helm/templates/_config-render.tpl | 1 + k8s/helm/templates/_helpers.tpl | 434 ++++++++ k8s/helm/templates/api-env-secret.yaml | 13 + k8s/helm/templates/api/_helpers.tpl | 57 ++ k8s/helm/templates/api/api-deployment.yaml | 151 +++ k8s/helm/templates/api/api-hpa.yaml | 39 + k8s/helm/templates/api/api-pdb.yaml | 25 + k8s/helm/templates/api/api-service.yaml | 23 + 
.../templates/api/api-serviceaccount.yaml | 16 + .../templates/api/api-servicemonitor.yaml | 37 + k8s/helm/templates/core/_helpers.tpl | 92 ++ .../templates/core/controller-deployment.yaml | 133 +++ k8s/helm/templates/core/controller-role.yaml | 67 ++ .../core/controller-service-headless.yaml | 24 + .../core/controller-serviceaccount.yaml | 16 + .../core/controller-servicemonitor.yaml | 37 + .../templates/core/jobs-serviceaccount.yaml | 16 + k8s/helm/templates/core/shared-pvc.yaml | 25 + k8s/helm/templates/httproute.yaml | 40 + k8s/helm/templates/imagepull-secret.yaml | 15 + k8s/helm/templates/ingress.yaml | 56 + .../templates/models-files-auth-secret.yaml | 12 + .../templates/networking/kyverno-policy.yaml | 549 ++++++++++ .../networking/nccl-topology-configmap.yaml | 78 ++ k8s/helm/templates/ngc-api-secret.yaml | 12 + k8s/helm/templates/openshift-route.yaml | 28 + k8s/helm/templates/platform-configmap.yaml | 9 + k8s/helm/templates/platform-seed-job.yaml | 81 ++ .../templates/postgres/postgres-secret.yaml | 12 + .../templates/postgres/postgres-service.yaml | 19 + .../postgres/postgres-serviceaccount.yaml | 14 + .../postgres/postgres-statefulset.yaml | 122 +++ k8s/helm/templates/proxy/_helpers.tpl | 25 + k8s/helm/templates/proxy/envoy-configmap.yaml | 68 ++ .../templates/proxy/envoy-deployment.yaml | 91 ++ k8s/helm/templates/proxy/envoy-hpa.yaml | 38 + k8s/helm/templates/proxy/envoy-service.yaml | 26 + .../templates/proxy/envoy-serviceaccount.yaml | 15 + .../templates/proxy/envoy-servicemonitor.yaml | 38 + k8s/helm/templates/tests/nccl-test.yaml | 314 ++++++ k8s/helm/values.yaml | 956 ++++++++++++++++++ services/core/jobs/README.md | 8 +- services/guardrails/callouts/README.md | 2 +- tools/lint/lint-helm.sh | 2 +- .../studio/scripts/feature-flag-matrix.ts | 23 +- 79 files changed, 5538 insertions(+), 32 deletions(-) create mode 100644 k8s/helm/.helmignore create mode 100644 k8s/helm/Chart.yaml create mode 100644 k8s/helm/README.md create mode 100644 
k8s/helm/ci/01-preexisting-imagepullsecret.yaml create mode 100644 k8s/helm/ci/02-empty-imagepullsecret.yaml create mode 100644 k8s/helm/ci/03-global-security-context.yaml create mode 100644 k8s/helm/ci/04-security-context-override.yaml create mode 100644 k8s/helm/ci/05-enable-auth.yaml create mode 100644 k8s/helm/ci/06-kyverno-aws.yaml create mode 100644 k8s/helm/ci/07-kyverno-azure.yaml create mode 100644 k8s/helm/ci/08-kyverno-gcp.yaml create mode 100644 k8s/helm/ci/09-kyverno-oci.yaml create mode 100644 k8s/helm/ci/10-override-image-registry.yaml create mode 100644 k8s/helm/ci/11-setting-env.yaml create mode 100644 k8s/helm/ci/12-topology-spread-constraints.yaml create mode 100644 k8s/helm/ci/13-openshift.yaml create mode 100644 k8s/helm/ci/14-pdb-min-available.yaml create mode 100644 k8s/helm/ci/14-secretfromenv.yaml create mode 100644 k8s/helm/ci/15-pdb-max-unavailable.yaml create mode 100644 k8s/helm/ci/16-enable-ingress.yaml create mode 100644 k8s/helm/ci/17-ingress-default-host.yaml create mode 100644 k8s/helm/ci/18-enable-httproute.yaml create mode 100644 k8s/helm/ci/19-envoy-extra-args.yaml create mode 100644 k8s/helm/files/nccl-test/entrypoint.sh create mode 100644 k8s/helm/files/nccl-test/nccl-env.sh create mode 100644 k8s/helm/files/nccl-test/nccl_test.py create mode 100644 k8s/helm/files/nccl-test/orchestrator.py create mode 100644 k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl create mode 100644 k8s/helm/templates/NOTES.txt create mode 100644 k8s/helm/templates/_config-render.tpl create mode 100644 k8s/helm/templates/_helpers.tpl create mode 100644 k8s/helm/templates/api-env-secret.yaml create mode 100644 k8s/helm/templates/api/_helpers.tpl create mode 100644 k8s/helm/templates/api/api-deployment.yaml create mode 100644 k8s/helm/templates/api/api-hpa.yaml create mode 100644 k8s/helm/templates/api/api-pdb.yaml create mode 100644 k8s/helm/templates/api/api-service.yaml create mode 100644 k8s/helm/templates/api/api-serviceaccount.yaml create 
mode 100644 k8s/helm/templates/api/api-servicemonitor.yaml create mode 100644 k8s/helm/templates/core/_helpers.tpl create mode 100644 k8s/helm/templates/core/controller-deployment.yaml create mode 100644 k8s/helm/templates/core/controller-role.yaml create mode 100644 k8s/helm/templates/core/controller-service-headless.yaml create mode 100644 k8s/helm/templates/core/controller-serviceaccount.yaml create mode 100644 k8s/helm/templates/core/controller-servicemonitor.yaml create mode 100644 k8s/helm/templates/core/jobs-serviceaccount.yaml create mode 100644 k8s/helm/templates/core/shared-pvc.yaml create mode 100644 k8s/helm/templates/httproute.yaml create mode 100644 k8s/helm/templates/imagepull-secret.yaml create mode 100644 k8s/helm/templates/ingress.yaml create mode 100644 k8s/helm/templates/models-files-auth-secret.yaml create mode 100644 k8s/helm/templates/networking/kyverno-policy.yaml create mode 100644 k8s/helm/templates/networking/nccl-topology-configmap.yaml create mode 100644 k8s/helm/templates/ngc-api-secret.yaml create mode 100644 k8s/helm/templates/openshift-route.yaml create mode 100644 k8s/helm/templates/platform-configmap.yaml create mode 100644 k8s/helm/templates/platform-seed-job.yaml create mode 100644 k8s/helm/templates/postgres/postgres-secret.yaml create mode 100644 k8s/helm/templates/postgres/postgres-service.yaml create mode 100644 k8s/helm/templates/postgres/postgres-serviceaccount.yaml create mode 100644 k8s/helm/templates/postgres/postgres-statefulset.yaml create mode 100644 k8s/helm/templates/proxy/_helpers.tpl create mode 100644 k8s/helm/templates/proxy/envoy-configmap.yaml create mode 100644 k8s/helm/templates/proxy/envoy-deployment.yaml create mode 100644 k8s/helm/templates/proxy/envoy-hpa.yaml create mode 100644 k8s/helm/templates/proxy/envoy-service.yaml create mode 100644 k8s/helm/templates/proxy/envoy-serviceaccount.yaml create mode 100644 k8s/helm/templates/proxy/envoy-servicemonitor.yaml create mode 100644 
k8s/helm/templates/tests/nccl-test.yaml create mode 100644 k8s/helm/values.yaml diff --git a/.github/actions/changes/action.yaml b/.github/actions/changes/action.yaml index e8420259ae..d91f2d8715 100644 --- a/.github/actions/changes/action.yaml +++ b/.github/actions/changes/action.yaml @@ -28,6 +28,9 @@ outputs: docker: description: "'true' if any Docker build files changed" value: ${{ steps.filter.outputs.docker }} + helm: + description: "'true' if any Helm chart files changed" + value: ${{ steps.filter.outputs.helm }} runs: using: "composite" @@ -63,3 +66,9 @@ runs: - 'docker-bake.hcl' - 'docker/**' - 'Makefile' + helm: + - 'k8s/**' + - 'tools/lint/lint-helm.sh' + - '.github/workflows/ci.yaml' + - '.github/actions/changes/action.yaml' + - '.pre-commit-config.yaml' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3647f00e9c..ecb2547d5e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,6 +31,7 @@ jobs: web-studio: ${{ steps.changes.outputs.web-studio }} tools: ${{ steps.changes.outputs.tools }} docker: ${{ steps.changes.outputs.docker }} + helm: ${{ steps.changes.outputs.helm }} steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - uses: ./.github/actions/changes @@ -82,6 +83,120 @@ jobs: make docker-print TARGET=nmp-automodel make docker-print TARGET=nmp-unsloth + helm-lint: + name: Helm lint + needs: [changes] + if: > + !cancelled() && ( + github.event_name == 'workflow_dispatch' || + needs.changes.outputs.helm == 'true' + ) + runs-on: ubuntu-latest + permissions: + contents: read + env: + HELM_FOLDER: k8s/helm + HELM_RELEASE_NAME: nemo-platform + KUBECONFORM_VERSION: v0.6.7 + steps: + - name: Checkout code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - name: Install Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + - name: Install kubeconform + shell: bash + run: | + set -euo pipefail + + mkdir -p 
"${RUNNER_TEMP}/kubeconform" + curl -fsSL \ + -o "${RUNNER_TEMP}/kubeconform.tar.gz" \ + "https://github.com/yannh/kubeconform/releases/download/${KUBECONFORM_VERSION}/kubeconform-linux-amd64.tar.gz" + tar -xzf "${RUNNER_TEMP}/kubeconform.tar.gz" -C "${RUNNER_TEMP}/kubeconform" kubeconform + chmod +x "${RUNNER_TEMP}/kubeconform/kubeconform" + echo "${RUNNER_TEMP}/kubeconform" >> "${GITHUB_PATH}" + - name: Add NVIDIA Helm repo + shell: bash + run: | + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia + helm repo update + - name: Lint and validate Helm chart + shell: bash + run: tools/lint/lint-helm.sh + - name: Upload Helm lint artifacts + if: failure() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: helm-lint-artifacts + retention-days: 7 + if-no-files-found: ignore + path: | + k8s/helm/ci/*.output + k8s/helm/ci/*.kubeconform.json + + helm-chart-verifier: + name: Helm chart verifier + needs: [changes] + if: > + !cancelled() && ( + github.event_name == 'workflow_dispatch' || + needs.changes.outputs.helm == 'true' + ) + runs-on: ubuntu-latest + permissions: + contents: read + env: + CHART_VERIFIER_VERSION: "1.14.0" + HELM_FOLDER: k8s/helm + steps: + - name: Checkout code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - name: Install Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + - name: Install chart-verifier + shell: bash + run: | + set -euo pipefail + + mkdir -p "${RUNNER_TEMP}/chart-verifier" + curl -fsSL \ + -o "${RUNNER_TEMP}/chart-verifier.tar.gz" \ + "https://github.com/redhat-certification/chart-verifier/releases/download/${CHART_VERIFIER_VERSION}/chart-verifier-${CHART_VERIFIER_VERSION}.tgz" + tar -xzf "${RUNNER_TEMP}/chart-verifier.tar.gz" -C "${RUNNER_TEMP}/chart-verifier" chart-verifier + chmod +x "${RUNNER_TEMP}/chart-verifier/chart-verifier" + echo "${RUNNER_TEMP}/chart-verifier" >> "${GITHUB_PATH}" + - name: Prepare Helm dependencies 
+ shell: bash + run: | + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia + helm repo update + helm dep update "${HELM_FOLDER}" + - name: Run chart verifier + shell: bash + run: | + set -euo pipefail + + report_json="${RUNNER_TEMP}/chart-verifier-report.json" + chart-verifier verify --enable helm-lint,is-helm-v3,contains-values,has-readme \ + -o json "${HELM_FOLDER}" | tee "${report_json}" + + python3 - "${report_json}" <<'PY' + import json + import sys + + with open(sys.argv[1], encoding="utf-8") as report_file: + data = json.load(report_file) + + failed = [ + result.get("check", "?") + for result in data.get("results", []) + if result.get("outcome") == "FAIL" + ] + if failed: + print("chart-verifier failed checks: " + ", ".join(failed), file=sys.stderr) + sys.exit(1) + PY + lint: name: Lint all runs-on: ubuntu-latest @@ -782,6 +897,8 @@ jobs: - changes - actionlint - docker-bake-graph + - helm-lint + - helm-chart-verifier - lint - policy-wasm - python-unit-test-tools diff --git a/.gitignore b/.gitignore index 8f6a2d7714..d90f2ac3dd 100644 --- a/.gitignore +++ b/.gitignore @@ -61,20 +61,10 @@ services/core/auth/src/nmp/core/auth/assets/policy.wasm logs/* *.log *.log.jsonl -helm/platform/charts/ -helm/platform/Chart.lock -helm/platform/components/*/Chart.lock -helm/platform/components/*/charts/*.tgz -helm/platform-ea/components/*/charts/*.tgz -helm/platform-ea/charts/ -helm/platform-ea/Chart.lock -helm/platform-ea/components/*/Chart.lock -deploy/helm/platform/charts/ -deploy/helm/platform/Chart.lock -deploy/helm/platform/components/*/Chart.lock -deploy/helm/platform/components/*/charts/*.tgz +k8s/helm/Chart.lock +k8s/helm/charts/ chart-tmp -nemo-microservices-helm-chart-*.tgz +nemo-platform-*.tgz # Visual Studio Code config .vscode/ # PyCharm config diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 78e523e12f..30370b64a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -70,10 +70,10 @@ repos: rev: "v1.14.2" hooks: - 
id: helm-docs-container - files: '^deploy\/helm\/platform\/values.yaml$' + files: '^k8s\/helm\/values.yaml$' args: - - --chart-search-root=deploy/helm/platform/ - - --template-files=deploy/helm/platform/helm-docs-template/nemo-helm-readme.md.gotmpl + - --chart-search-root=k8s/helm/ + - --template-files=k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl - repo: local hooks: diff --git a/AGENTS.md b/AGENTS.md index 0bd8af5ff6..6fce3f0678 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -204,7 +204,7 @@ Pre-commit hooks run automatically before commits and pushes to ensure code qual - **Type checking (ty)** - Runs type checks on Python code (may need manual fixes) - **uv lock** - Automatically updates `uv.lock` when `pyproject.toml` changes - **uv lock check** - Verifies `uv.lock` is in sync with `pyproject.toml` -- **Helm Docs Container** - Runs `helm-docs` container to regenerate Helm documentation in `deploy/helm/platform/README.md` +- **Helm Docs Container** - Runs `helm-docs` container to regenerate Helm documentation in `k8s/helm/README.md` - **Check merge conflicts** - Detects merge conflict markers - **OpenAPI generator** (manual stage) - Regenerates OpenAPI spec when API files change - **Check policy WASM** (pre-push only) - Verifies OPA policy WASM is up-to-date diff --git a/k8s/helm/.helmignore b/k8s/helm/.helmignore new file mode 100644 index 0000000000..deba092c30 --- /dev/null +++ b/k8s/helm/.helmignore @@ -0,0 +1,28 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +# Ignore CI sources +ci/ + +# helm docs template +helm-docs-template/ diff --git a/k8s/helm/Chart.yaml b/k8s/helm/Chart.yaml new file mode 100644 index 0000000000..3266ff779c --- /dev/null +++ b/k8s/helm/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: nemo-platform +maintainers: + - name: NVIDIA, Inc. + url: https://www.nvidia.com +description: NeMo Platform Helm Chart +type: application +# version is the version of the Helm chart, and can be different from the appVersion. +version: 2.1.0 +# appVersion is the version of the application deployed in the chart. +appVersion: "26.3.0" +home: https://nvidia.com +dependencies: + ## NMP dependencies + - name: k8s-nim-operator + condition: k8s-nim-operator.enabled + repository: https://helm.ngc.nvidia.com/nvidia/ + version: "~3.1.0" diff --git a/k8s/helm/README.md b/k8s/helm/README.md new file mode 100644 index 0000000000..b0620c6334 --- /dev/null +++ b/k8s/helm/README.md @@ -0,0 +1,287 @@ +# NeMo Platform Helm Chart + +![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) + +Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| additionalImagePullSecrets | object | `{}` | List of additional image pull secrets to use for pulling container images. Can be used when multiple image pull secrets are required in your environment. | +| api | object | This object has the following default values for the API configuration. | API configuration settings for the api deployment | +| api.affinity | object | `{}` | Affinity configuration for the API service. | +| api.annotations | object | `{}` | Annotations to add to the API service deployment. 
| +| api.autoscaling | object | `{"annotations":{},"enabled":false,"maxReplicas":10,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Specifies autoscaling configurations for the deployment. | +| api.autoscaling.annotations | object | `{}` | Annotations for the HorizontalPodAutoscaler. | +| api.autoscaling.enabled | bool | `false` | Whether to enable horizontal pod autoscaler. | +| api.autoscaling.maxReplicas | int | `10` | The maximum number of replicas for the deployment. | +| api.autoscaling.minReplicas | int | `1` | The minimum number of replicas for the deployment. | +| api.autoscaling.targetCPUUtilizationPercentage | int | `80` | The target CPU utilization percentage. | +| api.enabled | bool | `true` | Specifies whether to enable the api deployment. | +| api.extraArgs | list | `[]` | Additional arguments to pass to the Platform API service | +| api.image | object | This object has the following default values for the image configuration. | Container image configuration for the api deployment. | +| api.image.pullPolicy | string | `"IfNotPresent"` | The image pull policy determining when to pull new images. | +| api.image.repository | string | `"nvcr.io/nvidia/nemo/nmp-api"` | The registry where the NeMo Platform image is located. | +| api.image.tag | string | `""` | The image tag to use. | +| api.livenessProbe | object | This object has the following default values for the liveness probe configuration. | Liveness probe configuration for the api service. | +| api.livenessProbe.failureThreshold | int | `3` | The failure threshold for the liveness probe. | +| api.livenessProbe.httpGet | object | `{"path":"/health/live","port":"http"}` | The HTTP GET request to use for the liveness probe. | +| api.livenessProbe.periodSeconds | int | `10` | The frequency in seconds to perform the liveness probe. | +| api.livenessProbe.timeoutSeconds | int | `5` | The timeout in seconds for the liveness probe. 
| +| api.nodeSelector | object | `{}` | Node selector configuration for the API service. | +| api.podAnnotations | object | `{}` | Annotations to add to the API service pod. | +| api.podDisruptionBudget | object | This object has the following default values for the pod disruption budget configuration. | PodDisruptionBudget configuration for the API service. | +| api.podDisruptionBudget.annotations | object | `{}` | Annotations for the PodDisruptionBudget. | +| api.podDisruptionBudget.enabled | bool | `false` | Whether to create a PodDisruptionBudget for the API pods. | +| api.podDisruptionBudget.minAvailable | int | `1` | Minimum number of API pods that must remain available during voluntary disruptions. Only one of minAvailable or maxUnavailable may be set. | +| api.podLabels | object | `{}` | Labels for the API service pod. | +| api.podSecurityContext | object | This object has the following default values for the pod security context. | Pod-level security context settings for the API service. | +| api.podSecurityContext.fsGroup | int | `1000` | The file system group ID to use for all containers. | +| api.readinessProbe | object | This object has the following default values for the readiness probe configuration. | Readiness probe configuration for the api service. | +| api.readinessProbe.failureThreshold | int | `3` | The failure threshold for the readiness probe. | +| api.readinessProbe.httpGet | object | `{"path":"/health/ready","port":"http"}` | The HTTP GET request to use for the readiness probe. | +| api.readinessProbe.periodSeconds | int | `10` | The frequency in seconds to perform the readiness probe. | +| api.readinessProbe.timeoutSeconds | int | `5` | The timeout in seconds for the readiness probe. | +| api.replicaCount | int | `1` | Number of replicas for the API service. | +| api.resources | object | `{}` | Kubernetes deployment resources configuration for the API service. 
| +| api.securityContext | object | `{}` | Container-level security context settings for the API service. | +| api.service | object | This object has the following default values for the service configuration. | Service configuration for the API service. | +| api.service.annotations | object | `{}` | Annotations for the API service. | +| api.service.port | int | `8080` | The port number to expose for the service. | +| api.service.type | string | `"ClusterIP"` | The Kubernetes service type to create. | +| api.serviceAccount | object | This object has the following default values for the service account configuration. | Service account configuration for the API service. | +| api.serviceAccount.annotations | object | `{}` | Annotations to add to the service account. | +| api.serviceAccount.automount | bool | `true` | Automatically mount a ServiceAccount's API credentials. | +| api.serviceAccount.create | bool | `true` | Specifies whether a service account should be created. | +| api.serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template. | +| api.serviceMonitor.annotations | object | `{}` | Additional annotations to add to the ServiceMonitor | +| api.serviceMonitor.enabled | bool | `false` | Enable ServiceMonitor resources for Prometheus Operator | +| api.serviceMonitor.interval | string | `"30s"` | Scrape interval for the ServiceMonitor | +| api.serviceMonitor.labels | object | `{}` | Additional labels to add to the ServiceMonitor | +| api.serviceMonitor.scheme | string | `"http"` | Scheme to use for scraping metrics (http or https) | +| api.startupProbe | object | This object has the following default values for the startup probe configuration. | Startup probe configuration for the api service. | +| api.startupProbe.failureThreshold | int | `24` | The failure threshold for the startup probe. 
| +| api.startupProbe.httpGet | object | `{"path":"/health/ready","port":"http"}` | The HTTP GET request to use for the startup probe. | +| api.startupProbe.initialDelaySeconds | int | `10` | Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting). | +| api.startupProbe.periodSeconds | int | `15` | The frequency in seconds to perform the startup probe. | +| api.startupProbe.timeoutSeconds | int | `5` | The timeout in seconds for the startup probe. | +| api.telemetry | object | `{}` | OpenTelemetry configuration overrides for the api deployment. | +| api.tolerations | list | `[]` | Tolerations configuration for the API service. | +| api.topologySpreadConstraints | list | `[]` | Topology spread constraints for the API service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ | +| basePlatformConfig | string | This object has the following default values for the base platform configuration. | Base platform configuration settings | +| core | object | This object has the following default values for the core deployment configuration. | Core deployment configuration settings | +| core.controller.affinity | object | `{}` | Affinity configuration for the controller service. | +| core.controller.annotations | object | `{}` | Annotations to add to the controller service deployment. | +| core.controller.env | object | `{}` | Additional environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}. | +| core.controller.extraArgs | list | `[]` | Additional arguments to pass to the Core Controller service | +| core.controller.livenessProbe | object | This object has the following default values for the liveness probe configuration. | Liveness probe configuration for the controller service. | +| core.controller.livenessProbe.failureThreshold | int | `3` | The failure threshold for the liveness probe. 
| +| core.controller.livenessProbe.httpGet | object | `{"path":"/health/live","port":"http"}` | The HTTP GET request to use for the liveness probe. | +| core.controller.livenessProbe.periodSeconds | int | `10` | The frequency in seconds to perform the liveness probe. | +| core.controller.livenessProbe.timeoutSeconds | int | `5` | The timeout in seconds for the liveness probe. | +| core.controller.nodeSelector | object | `{}` | Node selector configuration for the controller service. | +| core.controller.podAnnotations | object | `{}` | Annotations to add to the controller service pod. | +| core.controller.podLabels | object | `{}` | Labels for the controller service pod. | +| core.controller.podSecurityContext | object | This object has the following default values for the pod security context. | Pod-level security context settings for the controller service. | +| core.controller.podSecurityContext.fsGroup | int | `1000` | The file system group ID to use for all containers. | +| core.controller.readinessProbe | object | This object has the following default values for the readiness probe configuration. | Readiness probe configuration for the controller service. | +| core.controller.readinessProbe.failureThreshold | int | `3` | The failure threshold for the readiness probe. | +| core.controller.readinessProbe.httpGet | object | `{"path":"/health/ready","port":"http"}` | The HTTP GET request to use for the readiness probe. | +| core.controller.readinessProbe.periodSeconds | int | `10` | The frequency in seconds to perform the readiness probe. | +| core.controller.readinessProbe.timeoutSeconds | int | `5` | The timeout in seconds for the readiness probe. | +| core.controller.resources | object | `{}` | Kubernetes deployment resources configuration for the controller service. | +| core.controller.securityContext | object | `{}` | Container-level security context settings for the controller service. 
| +| core.controller.service | object | This object has the following default values for the service configuration. | Service configuration for the controller service. This only configures a headless service for DNS resolution. | +| core.controller.service.annotations | object | `{}` | Annotations for the headless controller service. | +| core.controller.service.port | int | `8080` | The port for the service. | +| core.controller.serviceAccount | object | This object has the following default values for the service account configuration. | Service account configuration for the controller service. | +| core.controller.serviceAccount.annotations | object | `{}` | Annotations to add to the service account. | +| core.controller.serviceAccount.automount | bool | `true` | Automatically mount a ServiceAccount's API credentials. | +| core.controller.serviceAccount.create | bool | `true` | Specifies whether a service account should be created. | +| core.controller.serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template. | +| core.controller.startupProbe | object | This object has the following default values for the startup probe configuration. | Startup probe configuration for the controller service. | +| core.controller.startupProbe.failureThreshold | int | `24` | The failure threshold for the startup probe. | +| core.controller.startupProbe.httpGet | object | `{"path":"/health/ready","port":"http"}` | The HTTP GET request to use for the startup probe. | +| core.controller.startupProbe.initialDelaySeconds | int | `10` | Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting). | +| core.controller.startupProbe.periodSeconds | int | `15` | The frequency in seconds to perform the startup probe. | +| core.controller.startupProbe.timeoutSeconds | int | `5` | The timeout in seconds for the startup probe. 
| +| core.controller.tolerations | list | `[]` | Tolerations configuration for the controller service. | +| core.controller.topologySpreadConstraints | list | `[]` | Topology spread constraints for the controller service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ | +| core.enabled | bool | `true` | Specifies whether to enable the core deployment. | +| core.image | object | This object has the following default values for the image configuration. | Container image configuration for the core deployment. | +| core.image.pullPolicy | string | `"IfNotPresent"` | The image pull policy determining when to pull new images. | +| core.image.repository | string | `"nvcr.io/nvidia/nemo/nmp-api"` | The registry where the NeMo Platform image is located. | +| core.image.tag | string | `""` | The image tag to use. | +| core.jobs | object | This object has the following default values for the jobs service account configuration. | Service account configuration for pods created by the jobs controller (Kubernetes/Volcano job pods). | +| core.jobs.serviceAccount.annotations | object | `{}` | Annotations to add to the service account. | +| core.jobs.serviceAccount.automount | bool | `true` | Automatically mount a ServiceAccount's API credentials. | +| core.jobs.serviceAccount.create | bool | `true` | Specifies whether a service account should be created for job pods. | +| core.jobs.serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated with a '-jobs' suffix. 
| +| core.serviceMonitor.annotations | object | `{}` | Additional annotations to add to the ServiceMonitor | +| core.serviceMonitor.enabled | bool | `false` | Enable ServiceMonitor resources for Prometheus Operator | +| core.serviceMonitor.interval | string | `"30s"` | Scrape interval for the ServiceMonitor | +| core.serviceMonitor.labels | object | `{}` | Additional labels to add to the ServiceMonitor | +| core.serviceMonitor.scheme | string | `"http"` | Scheme to use for scraping metrics (http or https) | +| core.storage.accessModes | list | `["ReadWriteMany"]` | accessModes for the persistent volume claim. This should include `ReadWriteMany` to ensure multiple job pods can write to the volume concurrently. | +| core.storage.annotations | object | `{}` | Annotations to add to the persistent volume claim | +| core.storage.existingPersistentVolumeName | string | `""` | If set, pods will mount this persistent volume for job-scoped storage and we will not create a new persistent volume claim. | +| core.storage.size | string | `"200Gi"` | size of the persistent volume claim used for persistent storage | +| core.storage.storageClass | string | `""` | Which storageClass to use when creating a new persistent volume claim. Empty string uses the cluster's default StorageClass. | +| core.storage.volumePermissionsImage | string | `"busybox"` | volumePermissionsImage is the image used to set permissions on the volume | +| core.telemetry | object | `{}` | OpenTelemetry configuration overrides for the platform deployment. | +| env | object | `{}` | Environment variables that will be applied to every deployment pod. Uses a simple key value map structure like MY_ENV_VAR: the-key and works with valueFrom as well. | +| envFromSecret | string | `""` | Optional. Name of an existing Kubernetes Secret to load as env vars (envFrom) for the API pod. When set, the chart does not create the default api-env secret; use your own secret (e.g. from Vault, sealed-secrets). 
When unset, the chart creates a default secret with the environment variable NMP_SECRETS_DEFAULT_ENCRYPTION_KEY for default installation. See the NeMo Platform documentation for more details on secrets encryption. | +| envoyProxy | object | This object has the following default values for the envoy proxy configuration. | Envoy proxy configuration settings. Resources are created only when platform config has auth.enabled: true (see platformConfig.auth.enabled). | +| envoyProxy.adminPort | int | `9901` | Envoy Admin port | +| envoyProxy.affinity | object | `{}` | Affinity configuration for the Envoy pods. | +| envoyProxy.annotations | object | `{}` | Annotations to add to the Envoy service deployment. | +| envoyProxy.autoscaling | object | `{"annotations":{},"enabled":false,"maxReplicas":10,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Specifies autoscaling configurations for the deployment. | +| envoyProxy.autoscaling.annotations | object | `{}` | Annotations for the HorizontalPodAutoscaler. | +| envoyProxy.autoscaling.enabled | bool | `false` | Whether to enable horizontal pod autoscaler. | +| envoyProxy.autoscaling.maxReplicas | int | `10` | The maximum number of replicas for the deployment. | +| envoyProxy.autoscaling.minReplicas | int | `1` | The minimum number of replicas for the deployment. | +| envoyProxy.autoscaling.targetCPUUtilizationPercentage | int | `80` | The target CPU utilization percentage. | +| envoyProxy.enabled | bool | `true` | Specifies whether to enable the Envoy proxy deployment. Rendered only when platform config has auth.enabled: true. | +| envoyProxy.extraArgs | list | `[]` | Extra arguments to append to the envoy container command. Useful for passing server flags such as concurrency. Example: ["--concurrency", "4"] | +| envoyProxy.livenessProbe | object | `{"failureThreshold":3,"httpGet":{"path":"/ready","port":"admin"},"periodSeconds":10,"timeoutSeconds":5}` | Liveness probe for the Envoy container (admin interface /ready). 
| +| envoyProxy.nodeSelector | object | `{}` | Node selector configuration for the Envoy pods. | +| envoyProxy.podAnnotations | object | `{}` | Annotations to add to the Envoy service pod. | +| envoyProxy.podDisruptionBudget | object | This object has the following default values for the pod disruption budget configuration. | PodDisruptionBudget configuration for the Envoy service. | +| envoyProxy.podDisruptionBudget.annotations | object | `{}` | Annotations for the PodDisruptionBudget. | +| envoyProxy.podDisruptionBudget.enabled | bool | `false` | Whether to create a PodDisruptionBudget for the Envoy pods. | +| envoyProxy.podDisruptionBudget.minAvailable | int | `1` | Minimum number of Envoy pods that must remain available during voluntary disruptions. Only one of minAvailable or maxUnavailable may be set. | +| envoyProxy.podLabels | object | `{}` | Labels for the Envoy service pod. | +| envoyProxy.podSecurityContext | object | This object has the following default values for the pod security context. | Pod-level security context settings for the Envoy service. | +| envoyProxy.podSecurityContext.fsGroup | int | `1000` | The file system group ID to use for all containers. | +| envoyProxy.readinessProbe | object | `{"failureThreshold":3,"httpGet":{"path":"/ready","port":"admin"},"periodSeconds":10,"timeoutSeconds":5}` | Readiness probe for the Envoy container (admin interface /ready). | +| envoyProxy.resources | object | `{}` | Kubernetes deployment resources configuration for the Envoy service. | +| envoyProxy.securityContext | object | `{}` | Container-level security context settings for the Envoy service. | +| envoyProxy.service | object | This object has the following default values for the service configuration. | Service configuration for the Envoy service. | +| envoyProxy.service.annotations | object | `{}` | Annotations for the Envoy service. | +| envoyProxy.service.port | int | `8080` | The port number to expose for the service. 
| +| envoyProxy.service.type | string | `"ClusterIP"` | The Kubernetes service type to create. | +| envoyProxy.serviceAccount | object | This object has the following default values for the service account configuration. | Service account configuration for the Envoy service. | +| envoyProxy.serviceAccount.annotations | object | `{}` | Annotations to add to the service account. | +| envoyProxy.serviceAccount.automount | bool | `true` | Automatically mount a ServiceAccount's API credentials. | +| envoyProxy.serviceAccount.create | bool | `true` | Specifies whether a service account should be created. | +| envoyProxy.serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template. | +| envoyProxy.serviceMonitor.annotations | object | `{}` | Additional annotations to add to the ServiceMonitor | +| envoyProxy.serviceMonitor.enabled | bool | `false` | Enable ServiceMonitor resources for Prometheus Operator | +| envoyProxy.serviceMonitor.interval | string | `"30s"` | Scrape interval for the ServiceMonitor | +| envoyProxy.serviceMonitor.labels | object | `{}` | Additional labels to add to the ServiceMonitor | +| envoyProxy.serviceMonitor.scheme | string | `"http"` | Scheme to use for scraping metrics (http or https) | +| envoyProxy.startupProbe | object | `{"failureThreshold":12,"httpGet":{"path":"/ready","port":"admin"},"periodSeconds":5,"timeoutSeconds":3}` | Startup probe for the Envoy container (admin interface /ready). | +| envoyProxy.timeouts | object | Tuned for streaming; increase or set to "0s" if requests are cut off. | Timeouts for proxying to long-lived streams (e.g. inference gateway). Use "0s" to disable a timeout. | +| envoyProxy.timeouts.connect | string | `"30s"` | Cluster connect timeout (time to establish connection to backend). | +| envoyProxy.timeouts.request | string | `"0s"` | Total request timeout. 
0 = disabled (required for streaming; any non-zero request timeout is not compatible with streaming). | +| envoyProxy.timeouts.requestHeaders | string | `"60s"` | Time to receive full request headers. 0 = disabled. | +| envoyProxy.timeouts.route | string | `"0s"` | Per-route timeout for the passthrough to backend. 0 = disabled. | +| envoyProxy.timeouts.streamIdle | string | `"0s"` | Stream idle timeout. Time with no activity before stream is closed. 0 = disabled (required for long-lived streams). | +| envoyProxy.tolerations | list | `[]` | Tolerations configuration for the Envoy pods. | +| envoyProxy.topologySpreadConstraints | list | `[]` | Topology spread constraints for the Envoy pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ | +| existingImagePullSecret | string | `"nvcrimagepullsecret"` | You can specify an existing Kubernetes image pull secret for pulling images from the NGC container registry. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string. | +| existingSecret | string | `"ngc-api"` | You can use an existing Kubernetes secret for communicating with the NGC API for downloading models. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string. | +| externalDatabase | object | This object has the following default values for the external PostgreSQL configuration. | External PostgreSQL configuration settings. These values are only used when postgresql.enabled is set to false. | +| externalDatabase.database | string | `"nemoplatform"` | Database name. | +| externalDatabase.existingSecret | string | `""` | Name of an existing secret resource containing the database credentials. | +| externalDatabase.existingSecretPasswordKey | string | `""` | Name of an existing secret key containing the database credentials. | +| externalDatabase.host | string | `"localhost"` | External database host address.
| +| externalDatabase.port | int | `5432` | External database port number. | +| externalDatabase.uriSecret | object | This object has the following default values for the URI secret configuration. | URI secret configuration for external database. | +| externalDatabase.uriSecret.key | string | `""` | Key in the URI secret containing the database URI. | +| externalDatabase.uriSecret.name | string | `""` | Name of the URI secret. | +| externalDatabase.user | string | `"nemo"` | Database username | +| fullnameOverride | string | `""` | | +| httpRoute.annotations | object | `{}` | Extra annotations for the HTTP Route object. | +| httpRoute.enabled | bool | `false` | Specifies whether to enable a Gateway API HTTP Route for the service. | +| httpRoute.filters | list | `[]` | This is a list of filters for the objects, such as CORS settings. | +| httpRoute.hostnames | list | `[]` | If this has a specific hostname, add the name or names here in an array. | +| httpRoute.labels | object | `{}` | Extra labels for the HTTP Route object. | +| httpRoute.parentRefs | list | `[]` | A list of Gateways to enable this route on. This is required if httpRoute.enabled is true. | +| httpRoute.pathRules | list | `[{"backends":[{"port":"{{ include \"nemo-platform.ingressBackendPort\" . }}","service":"{{ include \"nemo-platform.ingressBackendService\" . }}"}],"matches":[{"path":"/","type":"Exact"},{"path":"/apis","type":"PathPrefix"},{"path":"/studio","type":"PathPrefix"},{"path":"/cluster-info","type":"Exact"},{"path":"/status","type":"Exact"}]}]` | Path matches to route queries. | +| ingress.annotations | object | `{}` | Annotations for the ingress resource. | +| ingress.className | string | `""` | The ingress class to use if your cluster has more than one class. | +| ingress.defaultHost | string | `""` | Optional default hostname. When set, one rule is generated with this host and paths from the first entry in ingress.hosts.
| +| ingress.enabled | bool | `false` | Specifies whether to enable the ingress. | +| ingress.hosts[0] | object | `{"name":"","paths":[{"path":"/","pathType":"Exact","port":"{{ include \"nemo-platform.ingressBackendPort\" . }}","service":"{{ include \"nemo-platform.ingressBackendService\" . }}"},{"path":"/apis","pathType":"Prefix","port":"{{ include \"nemo-platform.ingressBackendPort\" . }}","service":"{{ include \"nemo-platform.ingressBackendService\" . }}"},{"path":"/studio","pathType":"Prefix","port":"{{ include \"nemo-platform.ingressBackendPort\" . }}","service":"{{ include \"nemo-platform.ingressBackendService\" . }}"},{"path":"/cluster-info","pathType":"Exact","port":"{{ include \"nemo-platform.ingressBackendPort\" . }}","service":"{{ include \"nemo-platform.ingressBackendService\" . }}"},{"path":"/status","pathType":"Exact","port":"{{ include \"nemo-platform.ingressBackendPort\" . }}","service":"{{ include \"nemo-platform.ingressBackendService\" . }}"}]}` | Hostname used by ingress. If blank, use path-only routing. | +| ingress.tls | list | `[]` | TLS configurations. | +| k8s-nim-operator.enabled | bool | `true` | Specifies whether to enable the default NIM Operator installation. To learn more, see [Install NIM Operator](https://docs.nvidia.com/nim-operator/latest/install.html). If you are using an existing NIM Operator installation, set this to false. | +| k8s-nim-operator.nfd.nodeFeatureRules.deviceID | bool | `false` | Specifies whether to enable device ID feature rules. | +| multinodeNetworking | object | `{"aws":{"efaDevicesPerGPU":1,"enabled":false},"azure":{"enabled":false,"rdmaDeviceName":"hca_shared_devices_a","rdmaDevicesPerGPU":1},"gcp":{"enabled":false},"oci":{"enabled":false,"rdmaDevicesPerGPU":8}}` | Multi-node networking configuration for distributed GPU training. These settings control Kyverno policies that inject cloud-specific networking and NCCL configurations. 
Requirements: - Kyverno policy engine must be installed in your cluster (required for multi-node networking) - Kyverno is NOT included as a subchart dependency and must be installed separately To install Kyverno: helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace --version 3.2.0 Documentation: https://kyverno.io/docs/installation/ Helm chart: https://kyverno.github.io/kyverno/ Note: Only enable ONE cloud provider per cluster deployment. | +| multinodeNetworking.aws | object | `{"efaDevicesPerGPU":1,"enabled":false}` | AWS-specific configuration for EFA device injection | +| multinodeNetworking.aws.efaDevicesPerGPU | int | `1` | Number of EFA devices to request per GPU (typically 1 or 4) | +| multinodeNetworking.aws.enabled | bool | `false` | Enable AWS-specific Kyverno policy for EFA device injection | +| multinodeNetworking.azure | object | `{"enabled":false,"rdmaDeviceName":"hca_shared_devices_a","rdmaDevicesPerGPU":1}` | Azure-specific configuration for InfiniBand/RDMA | +| multinodeNetworking.azure.enabled | bool | `false` | Enable Azure-specific Kyverno policy for InfiniBand/RDMA configuration | +| multinodeNetworking.azure.rdmaDeviceName | string | `"hca_shared_devices_a"` | RDMA device plugin resource name | +| multinodeNetworking.azure.rdmaDevicesPerGPU | int | `1` | Number of RDMA devices to request per GPU | +| multinodeNetworking.gcp | object | `{"enabled":false}` | GCP-specific configuration for TCP-X/TCP-XO | +| multinodeNetworking.gcp.enabled | bool | `false` | Enable GCP-specific Kyverno policy for TCP-X/TCP-XO configuration | +| multinodeNetworking.oci | object | `{"enabled":false,"rdmaDevicesPerGPU":8}` | OCI-specific configuration for InfiniBand/SR-IOV | +| multinodeNetworking.oci.enabled | bool | `false` | Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration | +| multinodeNetworking.oci.rdmaDevicesPerGPU | int | `8` | Number of RDMA devices (mlnxnics) to request per GPU | +| nameOverride | string | `""` 
| Overrides for name and fullname templates | +| ncclTest | object | `{"configMapCleanupJob":{"image":{"repository":"bitnami/kubectl","tag":"latest"}},"gpuNodeLabelKey":"nvidia.com/gpu.present","gpuNodeLabelValue":"true","gpuResourceKey":"nvidia.com/gpu","gpusPerNode":1,"iterations":3,"orchestrator":{"image":{"repository":"docker.io/library/python","tag":"3.12-slim"},"resources":{"limits":{"cpu":"1","memory":"512Mi"},"requests":{"cpu":"100m","memory":"256Mi"}}},"validation":{"minBandwidthMBpsAt1024MB":8000},"waitTimeoutSeconds":900,"worker":{"image":{"repository":"nvcr.io/nvidia/nemo/nmp-automodel-training","tag":""},"resources":{"limits":{"cpu":"8","memory":"16Gi"},"requests":{"cpu":"4","memory":"8Gi"}}}}` | NCCL chart test (`helm test`): multi-node allreduce check. Templates use helm.sh/hook: test — they are not created on install/upgrade, only when you run helm test. Requires nodes labeled with gpuNodeLabelKey/gpuNodeLabelValue (default NFD / GPU operator style). See https://helm.sh/docs/topics/chart_tests/ | +| ncclTest.configMapCleanupJob | object | `{"image":{"repository":"bitnami/kubectl","tag":"latest"}}` | Post-test hook Job (after orchestrator): kubectl deletes the scripts ConfigMap (helm.sh/hook-weight 5). | +| ncclTest.gpuNodeLabelKey | string | `"nvidia.com/gpu.present"` | Node label used to discover GPU workers (must match your cluster). | +| ncclTest.gpuResourceKey | string | `"nvidia.com/gpu"` | Resource name for GPU capacity on worker pods (e.g. nvidia.com/gpu or a MIG device). | +| ncclTest.gpusPerNode | int | `1` | GPUs per worker pod / per node (torch.distributed nproc_per_node). IMPORTANT: Set this value before testing | +| ncclTest.iterations | int | `3` | How many times to run the full multinode NCCL test (orchestrator loop; env NCCL_TEST_ITERATIONS). 
Increase the test timeout on helm test if increasing this variable | +| ncclTest.validation.minBandwidthMBpsAt1024MB | int | `8000` | Minimum allreduce bandwidth (MB/s) at 1024MB message size; 0 disables the floor check in nccl_test.py. | +| ncclTest.waitTimeoutSeconds | int | `900` | Max seconds to wait for each worker pod to complete. | +| ngcAPIKey | string | `"YOUR-NGC-API-KEY"` | Your NVIDIA GPU Cloud (NGC) API key authenticates and enables pulling images from the NGC container registry. The existing secret overrides this key if you provide one to the `existingSecret` key. | +| openshiftRoute | object | `{"annotations":{},"enabled":false,"host":"","labels":{},"service":"{{ include \"nemo-platform.ingressBackendService\" . }}","targetPort":"{{ include \"nemo-platform.ingressBackendPort\" . }}","tls":{}}` | OpenShift Route (route.openshift.io/v1). Use on OpenShift to expose the API via a Route instead of Ingress. | +| openshiftRoute.annotations | object | `{}` | Annotations for the route resource. | +| openshiftRoute.enabled | bool | `false` | Specifies whether to create an OpenShift Route for the API service. | +| openshiftRoute.host | string | `""` | Hostname for the route. If empty, the OpenShift router may assign a default hostname. | +| openshiftRoute.labels | object | `{}` | Labels for the route resource. | +| openshiftRoute.service | string | `"{{ include \"nemo-platform.ingressBackendService\" . }}"` | Service name to route to. Defaults to Envoy when auth+envoy enabled, otherwise API (tpl-evaluated). | +| openshiftRoute.targetPort | string | `"{{ include \"nemo-platform.ingressBackendPort\" . }}"` | Target port on the service. Defaults to Envoy or API port depending on auth (tpl-evaluated). | +| openshiftRoute.tls | object | `{}` | Optional TLS configuration (termination, certificate, key, etc.). See OpenShift Route spec. 
| +| platformConfig | object | `{}` | Platform-wide configuration settings. Set configuration here to apply custom, structured configuration across all services. Applied after the base platform config is evaluated for templates. Enables adding / overriding YAML-based elements in the evaluated platform config. It is usually recommended to use this config section instead of `basePlatformConfig` unless you need to use templating features. For example, you can set the NIM default StorageClass via models.controller.backends.k8s-nim-operator.config.default_storage_class. For full configuration reference, see the NeMo Platform's config reference: https://docs.nvidia.com/nemo/microservices/latest/set-up/config-reference.html | +| platformSeedJob | object | This object has the following default values for the platform seed Job configuration. | Platform seed Job (Helm hook: runs after install/upgrade). Runs the platform-seed task (guardrails configs, evaluator system entities, data designer filesets). Uses post-install,post-upgrade hooks so it runs on fresh installs and can be re-triggered on no-op upgrade. | +| platformSeedJob.activeDeadlineSeconds | int | `600` | Maximum time in seconds the Job can run. | +| platformSeedJob.affinity | object | `{}` | Affinity for the platform seeding Job pod. | +| platformSeedJob.backoffLimit | int | `6` | Number of retries before considering the Job failed. | +| platformSeedJob.enabled | bool | `true` | Specifies whether to enable the platform-seed Job. | +| platformSeedJob.extraEnv | list | `[]` | Extra environment variables for the platform-seed container (e.g. CONFIG_STORE_PATH, NMP_PLATFORM_SEED_*). | +| platformSeedJob.nodeSelector | object | `{}` | Node selector for the platform seeding Job pod. | +| platformSeedJob.podLabels | object | `{}` | Additional labels for the platform seeding Job pod. | +| platformSeedJob.podSecurityContext | object | `{}` | Pod-level security context for the platform seeding Job pod.
| +| platformSeedJob.resources | object | `{}` | Resource requests/limits for the platform-seed container. | +| platformSeedJob.securityContext | object | `{}` | Container-level security context for the platform-seed container. | +| platformSeedJob.tolerations | list | `[]` | Tolerations for the platform seeding Job pod. | +| platformSeedJob.ttlSecondsAfterFinished | int | `86400` | Seconds after the Job finishes (success or failure) before it is eligible for automatic deletion. | +| podSecurityContext | object | This object has the following default values for the pod security context. | Pod security context settings applied to all services by default. These can be overridden in individual service configurations. | +| postgresql | object | This object has the following default values for the PostgreSQL configuration. | Local PostgreSQL configuration for the NeMo Platform. | +| postgresql.affinity | object | `{}` | Affinity for the PostgreSQL pod. | +| postgresql.auth | object | `{"database":"nemoplatform","existingSecret":"","password":"nemo","username":"nemo"}` | PostgreSQL authentication configuration. | +| postgresql.auth.existingSecret | string | `""` | Name of an existing secret containing a "password" key (or use existingSecretPasswordKey). If set, the chart does not create a secret. | +| postgresql.enabled | bool | `true` | Whether to deploy the embedded PostgreSQL. If enabled, the chart deploys a single-replica PostgreSQL instance using the official Postgres image. It is NOT recommended to use the built-in PostgreSQL for production deployments. It is enabled in the chart by default for ease of getting started with the platform. If you are using an existing PostgreSQL installation, set this to false and use the "externalDatabase" configuration section. | +| postgresql.nodeSelector | object | `{}` | Node selector for the PostgreSQL pod. 
| +| postgresql.persistence | object | `{"enabled":true,"size":"5Gi","storageClass":""}` | PostgreSQL persistence configuration. | +| postgresql.persistence.storageClass | string | `""` | Storage class for the PostgreSQL PVC. If unset, the cluster default is used. | +| postgresql.podSecurityContext | object | `{}` | Optional pod security context for the PostgreSQL pod (e.g. for OpenShift SCC). | +| postgresql.resources | object | `{}` | Optional resource limits/requests for the PostgreSQL container. | +| postgresql.securityContext | object | `{}` | Optional container security context for the PostgreSQL container. | +| postgresql.service | object | `{"port":5432}` | PostgreSQL service configuration. | +| postgresql.serviceAccount | object | This object has the following default values for the service account configuration. | Service account for the PostgreSQL pod. | +| postgresql.serviceAccount.annotations | object | `{}` | Annotations to add to the service account. | +| postgresql.serviceAccount.automount | bool | `true` | Automatically mount the ServiceAccount's API credentials. | +| postgresql.serviceAccount.create | bool | `true` | Specifies whether a service account should be created for the PostgreSQL pod. | +| postgresql.serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated from the release fullname. | +| postgresql.tolerations | list | `[]` | Tolerations for the PostgreSQL pod. | +| rbac | object | `{"k8sNimOperatorEnabled":true,"volcanoEnabled":true}` | RBAC configuration settings for optional dependencies | +| rbac.k8sNimOperatorEnabled | bool | `true` | Specifies whether to enable the core Controller to have RBAC permissions to k8s-nim-operator's NIMService for scheduling NIMs. | +| rbac.volcanoEnabled | bool | `true` | Specifies whether to enable the core Controller to have RBAC permissions to Volcano for scheduling distributed jobs. 
| +| securityContext | object | This object has the following default values for the container security context. | Container security context settings applied to all services by default. These can be overridden in individual service configurations. | +| telemetry.OTEL_EXPORTER_OTLP_ENDPOINT | string | `""` | The OpenTelemetry grpc collector endpoint to export traces and metrics to. | +| telemetry.OTEL_EXPORTER_OTLP_INSECURE | bool | `true` | Whether to use an insecure connection (no TLS) to the OpenTelemetry collector endpoint. | +| telemetry.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT | string | `nil` | The OpenTelemetry metrics exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set. | +| telemetry.OTEL_EXPORTER_OTLP_METRICS_INSECURE | bool | `true` | Whether to use an insecure connection (HTTP) to the OpenTelemetry metrics exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set. | +| telemetry.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT | string | `nil` | The OpenTelemetry traces exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set. | +| telemetry.OTEL_EXPORTER_OTLP_TRACES_INSECURE | bool | `true` | Whether to use an insecure connection (HTTP) to the OpenTelemetry traces exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set. | +| telemetry.OTEL_METRICS_EXPORTER | string | `"none"` | The OpenTelemetry metrics exporter to use. Options are "otlp", "prometheus" or "none" to disable export. | +| telemetry.OTEL_SDK_DISABLED | bool | `false` | Disable OpenTelemetry instrumentation and exporting for all services. | +| telemetry.OTEL_TRACES_EXPORTER | string | `"none"` | The OpenTelemetry traces exporter to use. Options are "otlp" or "none" to disable export. 
| diff --git a/k8s/helm/ci/01-preexisting-imagepullsecret.yaml b/k8s/helm/ci/01-preexisting-imagepullsecret.yaml new file mode 100644 index 0000000000..334f0697fc --- /dev/null +++ b/k8s/helm/ci/01-preexisting-imagepullsecret.yaml @@ -0,0 +1 @@ +existingImagePullSecret: "nvcrimagepullsecret" diff --git a/k8s/helm/ci/02-empty-imagepullsecret.yaml b/k8s/helm/ci/02-empty-imagepullsecret.yaml new file mode 100644 index 0000000000..ed5c5054e1 --- /dev/null +++ b/k8s/helm/ci/02-empty-imagepullsecret.yaml @@ -0,0 +1,2 @@ +existingImagePullSecret: "" + diff --git a/k8s/helm/ci/03-global-security-context.yaml b/k8s/helm/ci/03-global-security-context.yaml new file mode 100644 index 0000000000..e5edcbc462 --- /dev/null +++ b/k8s/helm/ci/03-global-security-context.yaml @@ -0,0 +1,15 @@ +# Test global securityContext and podSecurityContext values +# These should be merged with component-specific values + +podSecurityContext: + runAsNonRoot: true + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + +securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL diff --git a/k8s/helm/ci/04-security-context-override.yaml b/k8s/helm/ci/04-security-context-override.yaml new file mode 100644 index 0000000000..c9e0c9fb33 --- /dev/null +++ b/k8s/helm/ci/04-security-context-override.yaml @@ -0,0 +1,30 @@ +# Test that component-specific security context values override global values + +# Global defaults +podSecurityContext: + runAsNonRoot: true + fsGroup: 1000 + +securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + +# Component overrides - these should take precedence over globals +api: + podSecurityContext: + fsGroup: 2000 # Override global fsGroup + runAsUser: 1001 + securityContext: + readOnlyRootFilesystem: false # Override global setting + +core: + api: + podSecurityContext: + fsGroup: 3000 + securityContext: + runAsUser: 1002 + controller: + podSecurityContext: + fsGroup: 4000 + securityContext: 
+ runAsUser: 1003 diff --git a/k8s/helm/ci/05-enable-auth.yaml b/k8s/helm/ci/05-enable-auth.yaml new file mode 100644 index 0000000000..8f0533d478 --- /dev/null +++ b/k8s/helm/ci/05-enable-auth.yaml @@ -0,0 +1,5 @@ +# Test that authentication is enabled on the platform. + +platformConfig: + auth: + enabled: true diff --git a/k8s/helm/ci/06-kyverno-aws.yaml b/k8s/helm/ci/06-kyverno-aws.yaml new file mode 100644 index 0000000000..2e5019e089 --- /dev/null +++ b/k8s/helm/ci/06-kyverno-aws.yaml @@ -0,0 +1,6 @@ +# Test AWS Kyverno policy for multi-node GPU training with EFA + +multinodeNetworking: + aws: + enabled: true + efaDevicesPerGPU: 1 diff --git a/k8s/helm/ci/07-kyverno-azure.yaml b/k8s/helm/ci/07-kyverno-azure.yaml new file mode 100644 index 0000000000..084aeeb72c --- /dev/null +++ b/k8s/helm/ci/07-kyverno-azure.yaml @@ -0,0 +1,7 @@ +# Test Azure Kyverno policy for multi-node GPU training with RDMA + +multinodeNetworking: + azure: + enabled: true + rdmaDevicesPerGPU: 1 + rdmaDeviceName: "hca_shared_devices_a" diff --git a/k8s/helm/ci/08-kyverno-gcp.yaml b/k8s/helm/ci/08-kyverno-gcp.yaml new file mode 100644 index 0000000000..1e5ee11820 --- /dev/null +++ b/k8s/helm/ci/08-kyverno-gcp.yaml @@ -0,0 +1,5 @@ +# Test GCP Kyverno policy for multi-node GPU training with TCP-X + +multinodeNetworking: + gcp: + enabled: true diff --git a/k8s/helm/ci/09-kyverno-oci.yaml b/k8s/helm/ci/09-kyverno-oci.yaml new file mode 100644 index 0000000000..6e114076c3 --- /dev/null +++ b/k8s/helm/ci/09-kyverno-oci.yaml @@ -0,0 +1,6 @@ +# Test OCI Kyverno policy for multi-node GPU training with RDMA + +multinodeNetworking: + oci: + enabled: true + rdmaDevicesPerGPU: 8 diff --git a/k8s/helm/ci/10-override-image-registry.yaml b/k8s/helm/ci/10-override-image-registry.yaml new file mode 100644 index 0000000000..32070b74b9 --- /dev/null +++ b/k8s/helm/ci/10-override-image-registry.yaml @@ -0,0 +1,11 @@ +api: + image: + repository: registry.example.com/nemo-platform/nmp-api + +core: + image: + 
repository: registry.example.com/nemo-platform/nmp-api + +platformConfig: + platform: + image_registry: registry.example.com/nemo-platform diff --git a/k8s/helm/ci/11-setting-env.yaml b/k8s/helm/ci/11-setting-env.yaml new file mode 100644 index 0000000000..4cd71231cf --- /dev/null +++ b/k8s/helm/ci/11-setting-env.yaml @@ -0,0 +1,17 @@ +env: + MY_ENV_VAR: + valueFrom: + secretKeyRef: + key: foo + value: bar + ANOTHER_VAR: bloop + THIS_WILL_BE_QUOTED_BY_TEMPLATE: true + +api: + env: + HELLO: THERE + +core: + controller: + env: + WHAT: 22 diff --git a/k8s/helm/ci/12-topology-spread-constraints.yaml b/k8s/helm/ci/12-topology-spread-constraints.yaml new file mode 100644 index 0000000000..d9e218bc07 --- /dev/null +++ b/k8s/helm/ci/12-topology-spread-constraints.yaml @@ -0,0 +1,22 @@ +# Test that topologySpreadConstraints are correctly templated for API and core controller deployments. +# CI validates by running: helm template nemo-platform . -f ci/12-topology-spread-constraints.yaml +# The rendered output must include topologySpreadConstraints in both deployment pod specs. + +api: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/component: nmp-api + +core: + controller: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/component: nmp-core-controller diff --git a/k8s/helm/ci/13-openshift.yaml b/k8s/helm/ci/13-openshift.yaml new file mode 100644 index 0000000000..a4a0da033a --- /dev/null +++ b/k8s/helm/ci/13-openshift.yaml @@ -0,0 +1,31 @@ +# OpenShift-friendly security context for restricted / restricted-v2 SCC. +# Use with: helm template k8s/helm -f k8s/helm/ci/13-openshift.yaml +# See k8s/helm/README.md "OpenShift" section. 
+ +podSecurityContext: + runAsNonRoot: true + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + +# Embedded PostgreSQL: run as non-root so postgres pods pass OpenShift SCC +postgresql: + securityContext: + runAsUser: 999 + runAsNonRoot: true + podSecurityContext: + runAsNonRoot: true + fsGroup: 999 + +# OpenShift Route (route.openshift.io/v1) for API exposure +# TODO: Add a way to test this via kubeconform or otherwise. +# Since this isn't in the default kube spec, it won't validate correctly. +#openshiftRoute: +# enabled: true +# host: "nmp.example.com" diff --git a/k8s/helm/ci/14-pdb-min-available.yaml b/k8s/helm/ci/14-pdb-min-available.yaml new file mode 100644 index 0000000000..bd4ca3460d --- /dev/null +++ b/k8s/helm/ci/14-pdb-min-available.yaml @@ -0,0 +1,9 @@ +# Test that PodDisruptionBudget renders with minAvailable. +# CI validates by running: helm template nemo-platform . -f ci/14-pdb-min-available.yaml +# The rendered output must include a PodDisruptionBudget for the API with spec.minAvailable: 2. + +api: + podDisruptionBudget: + enabled: true + minAvailable: 2 + diff --git a/k8s/helm/ci/14-secretfromenv.yaml b/k8s/helm/ci/14-secretfromenv.yaml new file mode 100644 index 0000000000..138af4b793 --- /dev/null +++ b/k8s/helm/ci/14-secretfromenv.yaml @@ -0,0 +1,7 @@ +# Test envFromSecret: use an external secret for API pod env vars (no default api-env secret created). +# CI validates by running: helm template nemo-platform . -f ci/14-secretfromenv.yaml +# When envFromSecret is set: the api-env Secret template must not render; the API deployment +# must reference this secret name in envFrom. 
+ +envFromSecret: "my-external-api-env" + diff --git a/k8s/helm/ci/15-pdb-max-unavailable.yaml b/k8s/helm/ci/15-pdb-max-unavailable.yaml new file mode 100644 index 0000000000..56fa19f240 --- /dev/null +++ b/k8s/helm/ci/15-pdb-max-unavailable.yaml @@ -0,0 +1,9 @@ +# Test that PodDisruptionBudget renders with maxUnavailable. +# CI validates by running: helm template nemo-platform . -f ci/15-pdb-max-unavailable.yaml +# The rendered output must include a PodDisruptionBudget for the API with spec.maxUnavailable: 1. + +api: + podDisruptionBudget: + enabled: true + maxUnavailable: 1 + diff --git a/k8s/helm/ci/16-enable-ingress.yaml b/k8s/helm/ci/16-enable-ingress.yaml new file mode 100644 index 0000000000..896d8fc73f --- /dev/null +++ b/k8s/helm/ci/16-enable-ingress.yaml @@ -0,0 +1,6 @@ +# CI validates by running: helm template nemo-platform . -f ci/16-enable-ingress.yaml +# Tests that enabling ingress renders a valid Ingress resource (path-only by default). + +ingress: + enabled: true + diff --git a/k8s/helm/ci/17-ingress-default-host.yaml b/k8s/helm/ci/17-ingress-default-host.yaml new file mode 100644 index 0000000000..852bc39f0b --- /dev/null +++ b/k8s/helm/ci/17-ingress-default-host.yaml @@ -0,0 +1,7 @@ +# CI validates by running: helm template nemo-platform . -f ci/17-ingress-default-host.yaml +# Tests that setting ingress.defaultHost renders one Ingress rule with that host and default paths (backend is API service). + +ingress: + enabled: true + defaultHost: "nmp.example.com" + diff --git a/k8s/helm/ci/18-enable-httproute.yaml b/k8s/helm/ci/18-enable-httproute.yaml new file mode 100644 index 0000000000..2cda9e34dc --- /dev/null +++ b/k8s/helm/ci/18-enable-httproute.yaml @@ -0,0 +1,9 @@ +# CI validates by running: helm template nemo-platform . -f ci/18-enable-httproute.yaml +# Tests that enabling Gateway API HTTPRoute renders a valid HTTPRoute (parentRefs required when enabled). 
#!/bin/bash
# Worker entrypoint for the Helm NCCL multi-node test.
#
# Steps: source the NCCL environment files under /platform-config, print
# GPU / network / RDMA diagnostics, optionally require an active InfiniBand
# port, wait until the leader address is usable, run /scripts/nccl_test.py via
# torch.distributed.run, then inspect the NCCL debug log.
#
# Environment read here:
#   NUM_NODES, NPROC_PER_NODE, NODE_RANK  rendezvous topology (defaults 1 / 1 / 0)
#   LEADER_ADDR, MASTER_PORT              rendezvous endpoint (port defaults to 29500)
#   NCCL_TEST_STRICT_IB_PORT_ACTIVE       "true": fail when ibv_devinfo shows no PORT_ACTIVE
#   NCCL_TEST_EXPECT_IB_TRANSPORT         "true": fail unless the NCCL log contains NET/IB
#   NCCL_DEBUG_FILE                       NCCL debug log (default /tmp/nccl_debug.log)
set -e

echo "=== NCCL Multi-node RDMA Test ==="
echo "Node: $(hostname)"
echo "Date: $(date)"

if [ -f /platform-config/nccl-env.sh ]; then
    echo "Loading universal NCCL configuration..."
    # shellcheck source=/dev/null
    source /platform-config/nccl-env.sh
else
    echo "Warning: No universal config found"
fi

for platform_config in /platform-config/*-env.sh; do
    # nccl-env.sh matches this glob too. It was loaded above; sourcing it again
    # here would let the universal defaults overwrite any platform file that
    # sorts before it in glob order.
    if [ "$(basename "$platform_config")" = "nccl-env.sh" ]; then
        continue
    fi
    if [ -f "$platform_config" ]; then
        echo "Loading platform-specific config: $(basename "$platform_config")"
        # shellcheck source=/dev/null
        source "$platform_config"
    fi
done

echo "=== Environment Setup ==="
echo "NCCL_DEBUG: $NCCL_DEBUG"
echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME"

echo "=== Hardware Detection ==="
nvidia-smi -L

echo "=== Network Device Detection ==="
if command -v ip >/dev/null 2>&1; then
    ip addr show | grep -E "^[0-9]+:" | head -10
fi

# RDMA / IB checks (aligned with rdma-debug-test.yaml and network-operator README)
echo "=== RDMA Debug (in-pod) ==="
if command -v lspci >/dev/null 2>&1; then
    echo "--- PCI Mellanox (lspci) ---"
    lspci 2>/dev/null | grep -i mellanox || echo "(no Mellanox PCI lines)"
else
    echo "(lspci not installed; skip PCI check)"
fi

if [ -d /sys/class/infiniband ] && [ "$(ls -A /sys/class/infiniband 2>/dev/null)" ]; then
    echo "--- /sys/class/infiniband ---"
    ls -la /sys/class/infiniband/ || true
    echo "--- RDMA / infiniband devices under /dev ---"
    find /dev -maxdepth 2 \( -name "umad*" -o -name "uverbs*" -o -path "/dev/infiniband/*" \) 2>/dev/null | head -30 || true
    if [ -d /dev/infiniband ]; then
        ls -la /dev/infiniband/ || true
    else
        echo "(no /dev/infiniband directory)"
    fi
    if command -v ibv_devinfo >/dev/null 2>&1; then
        echo "--- ibv_devinfo ---"
        ibv_devinfo || true
    else
        echo "(ibv_devinfo not in PATH)"
    fi
else
    echo "(no InfiniBand class devices visible in this pod — expected on socket/AWS-only setups)"
fi

if command -v lsmod >/dev/null 2>&1; then
    echo "--- Kernel modules (mlx / ib_ / rdma) ---"
    lsmod | grep -E "(mlx|ib_|rdma)" || echo "(none matched in this mount namespace)"
fi

# Strict mode only fails when IB devices exist, ibv_devinfo is installed and
# runs successfully, and its output contains no PORT_ACTIVE.
if [ "${NCCL_TEST_STRICT_IB_PORT_ACTIVE:-false}" = "true" ]; then
    if [ -d /sys/class/infiniband ] && [ "$(ls -A /sys/class/infiniband 2>/dev/null)" ]; then
        if command -v ibv_devinfo >/dev/null 2>&1; then
            if ibv_devinfo > /tmp/ibv_precheck.out 2>&1; then
                if ! grep -q "PORT_ACTIVE" /tmp/ibv_precheck.out; then
                    echo "ERROR: InfiniBand devices present but ibv_devinfo shows no PORT_ACTIVE (see network-operator README)."
                    cat /tmp/ibv_precheck.out
                    exit 1
                fi
                echo "✓ ibv_devinfo reports PORT_ACTIVE"
            fi
        fi
    fi
fi

if command -v fi_info >/dev/null 2>&1; then
    echo "=== EFA Devices ==="
    fi_info -p efa || echo "No EFA provider found"
fi

# NODE_RANK 0 is the rendezvous leader node; LEADER_ADDR is rank-0 pod IP. Skip waiting for self-DNS on that node.
NUM_NODES="${NUM_NODES:-1}"
NPROC_PER_NODE="${NPROC_PER_NODE:-1}"
NODE_RANK="${NODE_RANK:-0}"
# Same fallback port that nccl_test.py uses when MASTER_PORT is unset.
MASTER_PORT="${MASTER_PORT:-29500}"

if [ "${NODE_RANK}" = "0" ]; then
    echo "NODE_RANK 0 (leader node): skipping wait for LEADER_ADDR DNS ($LEADER_ADDR)"
elif [ -z "${LEADER_ADDR:-}" ]; then
    # Without an address the resolve loop below could never succeed.
    echo "ERROR: LEADER_ADDR must be set when NODE_RANK is not 0" >&2
    exit 1
elif [[ "$LEADER_ADDR" =~ ^[0-9]+(\.[0-9]+){3}$ || "$LEADER_ADDR" == *:* ]]; then
    # An IPv4/IPv6 literal needs no name resolution. NOTE(review): getent on an
    # address performs a reverse lookup, which presumably can fail indefinitely
    # for pod IPs without a PTR record — confirm on the target clusters.
    echo "LEADER_ADDR $LEADER_ADDR is an IP address; no DNS wait needed"
else
    echo "Waiting for $LEADER_ADDR to be available"
    while true; do
        if getent hosts "$LEADER_ADDR" >/dev/null 2>&1; then
            resolved_ip=$(getent hosts "$LEADER_ADDR" | awk '{print $1}')
            echo "Successfully resolved $LEADER_ADDR to $resolved_ip"
            break
        fi
        echo "Failed to resolve $LEADER_ADDR, retrying in 2 seconds..."
        sleep 2
    done
fi

echo "=== Starting NCCL AllReduce Test ==="
echo "torch.distributed.run: nnodes=${NUM_NODES} node_rank=${NODE_RANK} nproc_per_node=${NPROC_PER_NODE} master=${LEADER_ADDR}:${MASTER_PORT}"

# One process per GPU; global world size = NUM_NODES * NPROC_PER_NODE
if ! python3 -m torch.distributed.run \
    --nnodes="${NUM_NODES}" \
    --node_rank="${NODE_RANK}" \
    --nproc_per_node="${NPROC_PER_NODE}" \
    --master_addr="${LEADER_ADDR}" \
    --master_port="${MASTER_PORT}" \
    /scripts/nccl_test.py; then
    echo "ERROR: torch.distributed.run / nccl_test.py failed" >&2
    exit 1
fi

echo "=== NCCL Test Complete ==="

log_file="${NCCL_DEBUG_FILE:-/tmp/nccl_debug.log}"

if [ -f "$log_file" ]; then
    echo "--- NCCL debug log (NET transport lines) ---"
    grep "NET/" "$log_file" 2>/dev/null || true
fi

if [ "${NCCL_TEST_EXPECT_IB_TRANSPORT:-false}" = "true" ]; then
    if [ ! -f "$log_file" ]; then
        echo "ERROR: expected NET/IB check but $log_file is missing"
        exit 1
    fi
    if ! grep -q "NET/IB" "$log_file"; then
        echo "ERROR: expected NCCL NET/IB (InfiniBand); multicloud expects e.g. NET/IB : Using ... mlx5_* . Got:"
        grep "NET/" "$log_file" 2>/dev/null || echo "(no NET/ lines)"
        exit 1
    fi
    echo "✓ NCCL debug log contains NET/IB"
fi

cat "$log_file" 2>/dev/null || echo "No NCCL debug log found"
def run_nccl_test():
    """Run under torch.distributed.run (torchrun): one process per GPU, global all_reduce.

    Reads LOCAL_RANK / RANK / WORLD_SIZE from the environment, falls back to
    LEADER_ADDR (then 127.0.0.1) for MASTER_ADDR and to 29500 for MASTER_PORT,
    initialises the NCCL process group and times all_reduce on tensors of
    1 MB to 4096 MB.

    When NCCL_TEST_MIN_BANDWIDTH_MBPS is a positive number, the best bandwidth
    measured at 1024 MB / 4096 MB must reach it. The verdict is agreed across
    all ranks, so every rank exits with the same status.

    Exits the process with status 1 when CUDA is unavailable, when LOCAL_RANK
    has no matching visible GPU, or when any rank is below the threshold.
    """
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))

    if not os.environ.get("MASTER_ADDR"):
        os.environ["MASTER_ADDR"] = os.environ.get("LEADER_ADDR", "127.0.0.1")

    os.environ.setdefault("MASTER_PORT", "29500")

    master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")

    print("=== NCCL Test Debug Info ===")
    print(f"Rank: {rank}")
    print(f"Local Rank: {local_rank}")
    print(f"World Size: {world_size}")
    print(f"Master Address: {master_addr}")
    print(f"CUDA Available: {torch.cuda.is_available()}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
    if torch.cuda.is_available():
        nvis = torch.cuda.device_count()
        if nvis <= local_rank:
            print(
                f"ERROR: LOCAL_RANK={local_rank} but only {nvis} visible GPU(s)",
                file=sys.stderr,
            )
            sys.exit(1)
        torch.cuda.set_device(local_rank)
        print(f"Current CUDA Device: {torch.cuda.current_device()}")
        print(f"Device Name: {torch.cuda.get_device_name(local_rank)}")
    else:
        print("ERROR: CUDA is not available; NCCL test requires GPUs.", file=sys.stderr)
        sys.exit(1)
    print("================================")

    print("Environment variables set:")
    print(f" MASTER_ADDR: {os.environ.get('MASTER_ADDR', '')}")
    print(f" MASTER_PORT: {os.environ.get('MASTER_PORT', '')}")
    print(f" RANK: {os.environ.get('RANK', '')}")
    print(f" WORLD_SIZE: {os.environ.get('WORLD_SIZE', '')}")
    print(f" LOCAL_RANK: {os.environ.get('LOCAL_RANK', '')}")

    print(f"Initializing NCCL backend... Rank: {rank}, World: {world_size}, Master: {master_addr}")
    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=5))

    print(f"Rank: {dist.get_rank()}, World Size: {dist.get_world_size()}")
    print(f"CUDA Device: {torch.cuda.current_device()}")

    min_bw_raw = os.environ.get("NCCL_TEST_MIN_BANDWIDTH_MBPS", "0") or "0"
    try:
        min_bw = float(min_bw_raw)
    except ValueError:
        # Keep the lenient fallback, but say so: a typo would otherwise
        # silently switch the bandwidth check off.
        print(
            f"WARNING: NCCL_TEST_MIN_BANDWIDTH_MBPS={min_bw_raw!r} is not a number; "
            "bandwidth threshold disabled",
            file=sys.stderr,
        )
        min_bw = 0.0

    test_sizes = [1, 4, 16, 64, 256, 1024, 4096]
    bw_max_large = 0

    for size_mb in test_sizes:
        # float32 elements are 4 bytes each.
        elements = (size_mb * 1024 * 1024) // 4
        tensor = torch.randn(elements, device="cuda")

        print(f"Testing {size_mb}MB tensor ({elements} elements)...")

        # Warm-up rounds, excluded from the timing.
        for _ in range(5):
            dist.all_reduce(tensor)
        torch.cuda.synchronize()

        # perf_counter is monotonic, so clock adjustments cannot skew the interval.
        start_time = time.perf_counter()
        for _ in range(10):
            dist.all_reduce(tensor)
        torch.cuda.synchronize()
        end_time = time.perf_counter()

        avg_time = (end_time - start_time) / 10
        bandwidth = (size_mb * 2) / avg_time

        print(f"Size: {size_mb}MB, Time: {avg_time:.4f}s, Bandwidth: {bandwidth:.2f} MB/s")
        if size_mb in (1024, 4096):
            bw_max_large = max(bw_max_large, bandwidth)

    slow = min_bw > 0.0 and bw_max_large < min_bw
    if slow:
        print(
            f"ERROR: AllReduce best bandwidth at large message sizes (1024MB/4096MB) "
            f"bw_max_large={bw_max_large:.2f} MB/s is below minimum {min_bw:.2f} MB/s "
            "(slow or misconfigured interconnect; expect ~8000+ MB/s on IB + GPU Direct RDMA).",
            file=sys.stderr,
        )

    # Bandwidth is timed per process, so ranks can disagree on `slow`. Share the
    # verdict (MAX over ranks) so no rank is left waiting in the barrier below
    # for a peer that has already exited.
    verdict = torch.tensor([1 if slow else 0], dtype=torch.int32, device="cuda")
    dist.all_reduce(verdict, op=dist.ReduceOp.MAX)
    if bool(verdict.item()):
        if not slow:
            print(
                "ERROR: another rank measured AllReduce bandwidth below the configured minimum",
                file=sys.stderr,
            )
        dist.destroy_process_group()
        sys.exit(1)

    print("NCCL test completed successfully!")
    dist.barrier()
    dist.destroy_process_group()


if __name__ == "__main__":
    run_nccl_test()
+import os +import sys +import time +import traceback + +from kubernetes import client, config +from kubernetes.client import ( + V1Capabilities, + V1ConfigMapVolumeSource, + V1Container, + V1ContainerPort, + V1EnvVar, + V1EnvVarSource, + V1KeyToPath, + V1LocalObjectReference, + V1ObjectFieldSelector, + V1ObjectMeta, + V1OwnerReference, + V1Pod, + V1PodSpec, + V1ResourceRequirements, + V1SecurityContext, + V1Volume, + V1VolumeMount, +) +from kubernetes.client.rest import ApiException + + +def _truthy(val, default=False): + if val is None: + return default + return str(val).lower() in ("1", "true", "yes", "y") + + +def _env(name, default=None): + v = os.environ.get(name, default) + if v is None or v == "": + if default is not None: + return default + raise RuntimeError(f"missing env {name}") + return v + + +def _node_ready(node): + for c in node.status.conditions or []: + if c.type == "Ready" and c.status == "True": + return True + return False + + +def _wait_pod_success(v1, ns, name, timeout_s): + deadline = time.time() + float(timeout_s) + while time.time() < deadline: + pod = v1.read_namespaced_pod(name, ns) + phase = pod.status.phase + if phase == "Succeeded": + return + if phase == "Failed": + reason = pod.status.reason or "" + msg = f"pod {name} Failed: {reason}" + raise RuntimeError(msg) + time.sleep(3) + _print_pod_status_snapshot(v1, ns, name) + _dump_events(v1, ns, [name]) + raise TimeoutError(f"pod {name} did not succeed within {timeout_s}s") + + +def _wait_pod_running(v1, ns, name, timeout_s=300): + """Wait until pod phase is Running (has IP; needed before reading leader podIP for followers).""" + deadline = time.time() + float(timeout_s) + while time.time() < deadline: + pod = v1.read_namespaced_pod(name, ns) + phase = pod.status.phase + if phase == "Running": + return pod + if phase in ("Failed", "Succeeded"): + reason = pod.status.reason or "" + raise RuntimeError(f"pod {name} entered {phase} before Running: {reason}") + time.sleep(1) + raise 
TimeoutError(f"pod {name} did not reach Running within {timeout_s}s") + + +def _wait_pod_absent(v1, ns, name, timeout_s=120): + """Wait until the pod is gone from the API (delete has finished).""" + deadline = time.time() + float(timeout_s) + while time.time() < deadline: + try: + v1.read_namespaced_pod(name, ns) + except ApiException as e: + if e.status == 404: + return + raise + time.sleep(1) + raise TimeoutError(f"pod {name} still present after {timeout_s}s") + + +def _dump_logs(v1, ns, names): + for name in names: + try: + logs = v1.read_namespaced_pod_log(name, ns, container="nccl-test") + print(f"---- logs {name} ----\n{logs}") + except ApiException as e: + print(f"(no logs for {name}) {e}") + + +def _dump_events(v1, ns, names): + """Print Kubernetes Events for worker pods (scheduling / mount / OOM hints on failure).""" + if not names: + return + print("---- Kubernetes events (worker pods) ----") + for name in names: + try: + evs = v1.list_namespaced_event( + ns, + field_selector=f"involvedObject.name={name},involvedObject.kind=Pod", + ) + except ApiException as e: + print(f"(could not list events for {name}) {e}") + continue + items = evs.items or [] + if not items: + print(f"(no events for pod {name})") + continue + for ev in sorted( + items, + key=lambda x: str(x.last_timestamp or x.first_timestamp or ""), + ): + typ = ev.type or "" + reason = ev.reason or "" + msg = (ev.message or "").replace("\n", " ") + cnt = ev.count or 0 + print(f" {name}: [{typ}] {reason} (x{cnt}) {msg}") + + +def _print_pod_status_snapshot(v1, ns, name): + """Print latest Pod status (phase, conditions, container state) for debugging timeouts.""" + try: + pod = v1.read_namespaced_pod(name, ns) + except ApiException as e: + print(f"(could not read pod {name} for status: {e})") + return + st = pod.status + print(f"---- pod status snapshot: {name} ----") + print(f" phase: {st.phase}") + if st.message: + print(f" message: {st.message}") + if st.reason: + print(f" reason: {st.reason}") + 
for c in st.conditions or []: + print( + f" condition {c.type}: status={c.status} " + f"reason={c.reason or ''} message={(c.message or '').replace(chr(10), ' ')}" + ) + for cs in st.container_statuses or []: + parts = [f"ready={cs.ready}", f"restart_count={cs.restart_count}"] + state = cs.state + if state.waiting: + parts.append(f"waiting reason={state.waiting.reason} msg={state.waiting.message or ''}") + elif state.terminated: + t = state.terminated + parts.append(f"terminated exit={t.exit_code} reason={t.reason} msg={t.message or ''}") + elif state.running: + parts.append("running") + print(f" container {cs.name}: {'; '.join(parts)}") + + +def _delete_workers(v1, ns, names): + for name in names: + try: + v1.delete_namespaced_pod(name, ns, grace_period_seconds=0) + print(f"Delete issued for pod {name}") + except ApiException as ae: + if ae.status != 404: + print( + f"warn: could not delete {name}: {ae}", + file=sys.stderr, + ) + continue + _wait_pod_absent(v1, ns, name) + + +def _log_node_interconnect_hints(nodes, rdma_resource): + """Summarize labels/capacity for multinode networking (cloud-agnostic hints).""" + for n in nodes: + lb = n.metadata.labels or {} + alloc = n.status.allocatable or {} + primary = f"alloc[{rdma_resource}]={alloc.get(rdma_resource, '')} " if rdma_resource else "" + print( + f"Node {n.metadata.name}: mellanox={lb.get('nvidia.com/mellanox.present', '')} " + f"rdma_capable={lb.get('feature.node.kubernetes.io/rdma.capable', '')} " + f"{primary}" + f"efa={alloc.get('vpc.amazonaws.com/efa', '')} " + f"mlnxnics={alloc.get('nvidia.com/mlnxnics', '')}" + ) + + +def _print_leader_success_log(v1, ns, worker_pod_name): + """Print full nccl-test container log from the leader worker (rank-0 node) after success.""" + try: + logs = v1.read_namespaced_pod_log(worker_pod_name, ns, container="nccl-test") + except ApiException as e: + print(f"(could not load leader worker log for {worker_pod_name}: {e})") + return + print(f"================ {worker_pod_name} 
leader (nccl-test) ================================") + print(logs.rstrip()) + print("================================================================================") + + +def _resource_qty_str(q): + if q is None: + return "" + return str(q).strip() + + +def _assert_pod_interconnect_from_kyverno(pod, resource_name, want_req, want_lim): + """Assert pod spec has interconnect resource requests/limits (Kyverno mutate policy).""" + containers = pod.spec.containers or [] + if not containers: + raise RuntimeError("pod has no containers") + c0 = containers[0] + res = c0.resources + if not res: + raise RuntimeError("pod container has no resources") + req = res.requests or {} + lim = res.limits or {} + gr = req.get(resource_name) + gl = lim.get(resource_name) + if gr is None: + raise RuntimeError( + f"expected Kyverno to inject resources.requests[{resource_name!r}]={want_req!r}; got requests={dict(req)}" + ) + if gl is None: + raise RuntimeError( + f"expected Kyverno to inject resources.limits[{resource_name!r}]={want_lim!r}; got limits={dict(lim)}" + ) + if _resource_qty_str(gr) != _resource_qty_str(want_req): + raise RuntimeError(f"Kyverno requests[{resource_name}]: got {gr!r}, want {want_req!r}") + if _resource_qty_str(gl) != _resource_qty_str(want_lim): + raise RuntimeError(f"Kyverno limits[{resource_name}]: got {gl!r}, want {want_lim!r}") + + +def _wait_assert_kyverno_interconnect(v1, namespace, pod_name, resource_name, want_req, want_lim, timeout_s=30): + """Poll until pod spec shows Kyverno-injected interconnect resources (admission is usually sync).""" + deadline = time.time() + float(timeout_s) + last_err = None + while time.time() < deadline: + pod = v1.read_namespaced_pod(pod_name, namespace) + try: + _assert_pod_interconnect_from_kyverno(pod, resource_name, want_req, want_lim) + print(f"Kyverno assertion ok: pod {pod_name} has {resource_name} requests={want_req} limits={want_lim}") + return + except RuntimeError as e: + last_err = e + time.sleep(0.5) + raise 
last_err or RuntimeError("Kyverno interconnect assertion failed") + + +def _ensure_rdma_allocatable(nodes, resource_name): + missing = [] + for n in nodes: + alloc = n.status.allocatable or {} + if not alloc.get(resource_name): + missing.append(n.metadata.name) + if missing: + raise RuntimeError( + f"requireRdmaAllocatable: nodes missing {resource_name} in allocatable: {missing}. " + "See verify-multinode-setup.sh / network-operator docs." + ) + + +def config_test(iteration: int, print_logs: bool): + namespace = _env("NAMESPACE") + fullname = _env("TEST_FULLNAME") + label_key = _env("GPU_NODE_LABEL_KEY") + label_value = _env("GPU_NODE_LABEL_VALUE") + worker_image = _env("WORKER_IMAGE") + scripts_cm = _env("SCRIPTS_CONFIGMAP_NAME") + master_port = _env("MASTER_PORT", "29500") + timeout_s = int(_env("WAIT_TIMEOUT_SECONDS", "900")) + + image_pull_secret = os.environ.get("IMAGE_PULL_SECRET", "") + release_name = os.environ.get("RELEASE_NAME", "") + + gpu_req = _env("WORKER_GPU_REQUEST", "1") + gpu_resource_key = os.environ.get("WORKER_GPU_RESOURCE_KEY", "nvidia.com/gpu") + try: + gpu_n = max(1, int(float(gpu_req))) + except (ValueError, TypeError): + gpu_n = 1 + + cpu_req = _env("WORKER_CPU_REQUEST", "4") + cpu_lim = _env("WORKER_CPU_LIMIT", "8") + mem_req = _env("WORKER_MEMORY_REQUEST", "8Gi") + mem_lim = _env("WORKER_MEMORY_LIMIT", "16Gi") + + config.load_incluster_config() + v1 = client.CoreV1Api() + + pod_name = os.environ.get("POD_NAME") or os.environ.get("HOSTNAME", "") + if not pod_name: + print("ERROR: POD_NAME or HOSTNAME must be set", file=sys.stderr) + return 1 + hook = os.environ.get("HELM_HOOK", "test") + hook_delete = os.environ.get( + "HELM_HOOK_DELETE_POLICY", + "before-hook-creation,hook-succeeded,hook-failed", + ) + + try: + orch_pod = v1.read_namespaced_pod(pod_name, namespace) + except ApiException as e: + print( + f"ERROR: cannot read orchestrator pod {pod_name}: {e}", + file=sys.stderr, + ) + return 1 + + job_owner = None + for ref in 
orch_pod.metadata.owner_references or []: + if ref.kind == "Job": + job_owner = ref + break + + owner_refs = None + if job_owner is not None: + owner_refs = [ + V1OwnerReference( + api_version=job_owner.api_version, + kind=job_owner.kind, + name=job_owner.name, + uid=job_owner.uid, + controller=False, + block_owner_deletion=False, + ) + ] + else: + print( + "WARN: orchestrator pod has no Job ownerReference; workers will not be tied to hook Job GC", + file=sys.stderr, + ) + + selector = label_key + "=" + label_value + all_nodes = v1.list_node(label_selector=selector).items + nodes = sorted( + [n for n in all_nodes if _node_ready(n)], + key=lambda n: n.metadata.name, + ) + if not nodes: + print( + f"ERROR: No Ready nodes match GPU label selector {selector}", + file=sys.stderr, + ) + return 1 + + world_size = len(nodes) + global_world = world_size * gpu_n + print( + f"Discovered {world_size} GPU node(s): {[n.metadata.name for n in nodes]}; " + f"{gpu_n} GPU(s) per node → {global_world} global NCCL ranks" + ) + + rdma_alloc_res = os.environ.get("RDMA_ALLOCATABLE_RESOURCE", "").strip() + + _log_node_interconnect_hints(nodes, rdma_alloc_res) + + if _truthy(os.environ.get("REQUIRE_RDMA_ALLOCATABLE")): + if not rdma_alloc_res: + print( + "ERROR: REQUIRE_RDMA_ALLOCATABLE is true but RDMA_ALLOCATABLE_RESOURCE is empty " + "(template bug or unsupported cloud for allocatable checks).", + file=sys.stderr, + ) + return 1 + _ensure_rdma_allocatable(nodes, rdma_alloc_res) + print(f"All {world_size} GPU nodes advertise {rdma_alloc_res} in allocatable.") + + min_bw = os.environ.get("NCCL_TEST_MIN_BANDWIDTH_MBPS", "0") + exp_ib = _truthy(os.environ.get("NCCL_TEST_EXPECT_IB_TRANSPORT")) + strict_ib = _truthy(os.environ.get("NCCL_TEST_STRICT_IB_PORT_ACTIVE")) + print( + f"Validation flags: minBandwidth1024MB={min_bw} MB/s expectIbTransport={exp_ib} strictIbPortActive={strict_ib}" + ) + + worker_net = _truthy(os.environ.get("WORKER_INTERCONNECT_RESOURCE_ENABLED")) + wn_name = 
os.environ.get("WORKER_INTERCONNECT_RESOURCE_NAME", "").strip() + wn_req = os.environ.get("WORKER_INTERCONNECT_RESOURCE_REQUEST", "0") + wn_lim = os.environ.get("WORKER_INTERCONNECT_RESOURCE_LIMIT", wn_req) + # When true, we annotate disable-rdma-injection and must set interconnect resources ourselves. + injection_disabled = not _truthy(os.environ.get("NCCL_TEST_ALLOW_PLATFORM_INJECTION", "true")) + + worker_ann = { + "helm.sh/hook": hook, + "helm.sh/hook-delete-policy": hook_delete, + } + if _truthy(os.environ.get("NCCL_TEST_KYVERNO_ENABLE_MULTI_NODE", "true")): + worker_ann["nmp.nvidia.com/enable-multi-node-networking"] = "true" + worker_ann["nmp.nvidia.com/num-nodes"] = str(world_size) + if not _truthy(os.environ.get("NCCL_TEST_ALLOW_PLATFORM_INJECTION", "true")): + worker_ann["disable-rdma-injection"] = "true" + + created_names = [] + + def worker_pod_name(rank): + return f"{fullname}-w-{rank}" + + # Followers use rank-0 pod IP for LEADER_ADDR (PyTorch MASTER_ADDR). Headless DNS + # can resolve on the leader pod but fail on other nodes (NodeLocal DNS / split views). 
+ leader_ip = None + + image_pull_secrets = [] + if image_pull_secret: + image_pull_secrets.append(V1LocalObjectReference(name=image_pull_secret)) + + def _make_worker_pod(rank, hostname, leader_addr_from_field_ref): + name = worker_pod_name(rank) + meta_kwargs = dict( + name=name, + labels={ + "app.kubernetes.io/instance": release_name, + "app.kubernetes.io/name": "nemo-platform", + "nccl-helm-test-worker": "true", + "iteration": str(iteration), + }, + annotations=worker_ann, + ) + if owner_refs is not None: + meta_kwargs["owner_references"] = owner_refs + meta = V1ObjectMeta(**meta_kwargs) + if leader_addr_from_field_ref: + leader_env = V1EnvVar( + name="LEADER_ADDR", + value_from=V1EnvVarSource(field_ref=V1ObjectFieldSelector(field_path="status.podIP")), + ) + else: + if not leader_ip: + raise RuntimeError("leader_ip unset for follower worker") + leader_env = V1EnvVar(name="LEADER_ADDR", value=leader_ip) + env_vars = [ + V1EnvVar(name="PYTHONUNBUFFERED", value="1"), + V1EnvVar(name="NVIDIA_VISIBLE_DEVICES", value="all"), + V1EnvVar( + name="NVIDIA_DRIVER_CAPABILITIES", + value="compute,utility", + ), + V1EnvVar(name="NCCL_DEBUG", value="INFO"), + leader_env, + V1EnvVar(name="MASTER_PORT", value=master_port), + V1EnvVar(name="NODE_RANK", value=str(rank)), + V1EnvVar(name="NUM_NODES", value=str(world_size)), + V1EnvVar(name="NPROC_PER_NODE", value=str(gpu_n)), + V1EnvVar( + name="NCCL_TEST_MIN_BANDWIDTH_MBPS", + value=os.environ.get("NCCL_TEST_MIN_BANDWIDTH_MBPS", "0"), + ), + V1EnvVar( + name="NCCL_TEST_EXPECT_IB_TRANSPORT", + value=("true" if _truthy(os.environ.get("NCCL_TEST_EXPECT_IB_TRANSPORT")) else "false"), + ), + V1EnvVar( + name="NCCL_TEST_STRICT_IB_PORT_ACTIVE", + value=("true" if _truthy(os.environ.get("NCCL_TEST_STRICT_IB_PORT_ACTIVE")) else "false"), + ), + ] + req = { + gpu_resource_key: gpu_req, + "cpu": cpu_req, + "memory": mem_req, + } + lim = { + gpu_resource_key: gpu_req, + "cpu": cpu_lim, + "memory": mem_lim, + } + if worker_net and wn_name 
and injection_disabled: + req[wn_name] = wn_req + lim[wn_name] = wn_lim + + container = V1Container( + name="nccl-test", + image=worker_image, + image_pull_policy="IfNotPresent", + command=["/bin/bash", "/scripts/entrypoint.sh"], + env=env_vars, + resources=V1ResourceRequirements( + requests=req, + limits=lim, + ), + volume_mounts=[ + V1VolumeMount(name="scripts", mount_path="/scripts"), + V1VolumeMount( + name="platform-config", + mount_path="/platform-config", + ), + ], + ports=[V1ContainerPort(container_port=int(master_port))], + security_context=V1SecurityContext( + run_as_user=0, + capabilities=V1Capabilities(add=["IPC_LOCK", "SYS_NICE"]), + ), + ) + return V1Pod( + api_version="v1", + kind="Pod", + metadata=meta, + spec=V1PodSpec( + restart_policy="Never", + node_selector={ + "kubernetes.io/hostname": hostname, + label_key: label_value, + }, + image_pull_secrets=image_pull_secrets or None, + containers=[container], + volumes=[ + V1Volume( + name="scripts", + config_map=V1ConfigMapVolumeSource( + name=scripts_cm, + default_mode=493, + ), + ), + V1Volume( + name="platform-config", + config_map=V1ConfigMapVolumeSource( + name=scripts_cm, + default_mode=493, + items=[ + V1KeyToPath( + key="nccl-env.sh", + path="nccl-env.sh", + mode=493, + ) + ], + ), + ), + ], + ), + ) + + try: + n0 = nodes[0].metadata.name + name0 = worker_pod_name(0) + pod0 = _make_worker_pod(0, n0, leader_addr_from_field_ref=True) + print(f"Creating pod {name0} on node {n0}") + v1.create_namespaced_pod(namespace, body=pod0) + if worker_net and wn_name and not injection_disabled: + _wait_assert_kyverno_interconnect(v1, namespace, name0, wn_name, wn_req, wn_lim) + p0 = _wait_pod_running(v1, namespace, name0, timeout_s=300) + leader_ip = p0.status.pod_ip + if not leader_ip: + raise RuntimeError(f"pod {name0} has no status.pod_ip") + print(f"Leader pod IP for MASTER_ADDR (followers): {leader_ip}") + created_names.append(name0) + + for rank in range(1, world_size): + hostname = 
nodes[rank].metadata.name + name = worker_pod_name(rank) + pod = _make_worker_pod(rank, hostname, leader_addr_from_field_ref=False) + print(f"Creating pod {name} on node {hostname}") + v1.create_namespaced_pod(namespace, body=pod) + if worker_net and wn_name and not injection_disabled: + _wait_assert_kyverno_interconnect(v1, namespace, name, wn_name, wn_req, wn_lim) + created_names.append(name) + + for name in created_names: + print(f"Waiting for pod {name} ...") + _wait_pod_success(v1, namespace, name, timeout_s) + print(f"Pod {name} succeeded") + + if created_names: + if print_logs: + _print_leader_success_log(v1, namespace, created_names[0]) + + print( + f"NCCL validation passed: {world_size} node(s) × {gpu_n} GPU(s)/node = " + f"{global_world} processes in all_reduce." + ) + return 0 + except Exception as e: + print(f"ERROR: {e}", file=sys.stderr) + traceback.print_exc() + _dump_logs(v1, namespace, created_names) + _dump_events(v1, namespace, created_names) + return 1 + finally: + _delete_workers(v1, namespace, created_names) + + +def main(): + iteration = int(_env("NCCL_TEST_ITERATIONS", "10")) + for i in range(iteration): + print(f"Running NCCL test iteration {i + 1} of {iteration}") + res = config_test(i, print_logs=i == iteration - 1) + if res != 0: + return res + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl b/k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl new file mode 100644 index 0000000000..38fb8da57f --- /dev/null +++ b/k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl @@ -0,0 +1,7 @@ +# {{ template "chart.description" . }} + +{{ template "chart.typeBadge" . }} + +Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. + +{{ template "chart.valuesSection" . 
}} diff --git a/k8s/helm/templates/NOTES.txt b/k8s/helm/templates/NOTES.txt new file mode 100644 index 0000000000..e4d688b21d --- /dev/null +++ b/k8s/helm/templates/NOTES.txt @@ -0,0 +1,133 @@ +{{- $apiServiceName := include "nmp-api.api-servicename" . -}} +{{- $apiServicePort := .Values.api.service.port -}} +{{- $namespace := .Release.Namespace -}} +================================================================================ +🚀 NVIDIA NeMo Microservices Platform + v{{ .Chart.AppVersion }} +================================================================================ + +Successfully installed {{ .Chart.Name }}-{{ .Chart.Version }}, named {{ .Release.Name }} + +{{- if .Values.ingress.enabled }} + External URL (via Ingress): + {{- if and .Values.ingress.defaultHost .Values.ingress.hosts }} + {{- range (index .Values.ingress.hosts 0).paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $.Values.ingress.defaultHost }}{{ .path }} + {{- end }} + {{- else }} + {{- range $host := .Values.ingress.hosts }} + {{- if $host.name }} + {{- range $host.paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.name }}{{ .path }} + {{- end }} + {{- else }} + {{- range $host.paths }} + Configure your ingress controller for path: {{ .path }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} +{{- else if .Values.openshiftRoute.enabled }} + External URL (via OpenShift Route): + {{- if .Values.openshiftRoute.host }} + https://{{ .Values.openshiftRoute.host }} + {{- else }} + Check the Route status for the assigned hostname: kubectl get route -n {{ $namespace }} {{ include "nemo-platform.name" . }} + {{- end }} +{{- else if .Values.httpRoute.enabled }} + External URL (via Gateway API HTTP Route): + {{- if .Values.httpRoute.hostnames }} + {{- range .Values.httpRoute.hostnames }} + https://{{ . 
}} + {{- end }} + {{- end }} +{{- else }} + + Internal Service URL (from within cluster): + http://{{ $apiServiceName }}.{{ $namespace }}.svc.cluster.local:{{ $apiServicePort }} + + To access from your local machine, use port-forward: + kubectl port-forward -n {{ $namespace }} svc/{{ $apiServiceName }} 8080:{{ $apiServicePort }} + + Then access the platform at: + http://localhost:8080 +{{- end }} + +{{- if not (include "nemo-platform.authEnabled" .) }} + + AUTHENTICATION IS DISABLED. This is only suitable for + development and testing. + +{{- end }} + + Install the CLI (if not already installed) and verify: + + > pip install nemo-platform + > nemo config set --base-url + > nemo auth login --base-url (If authentication is enabled) + > nemo workspaces list + + Documentation: https://docs.nvidia.com/nemo/microservices/ + +{{- if .Values.postgresql.enabled }} + + You are using the embedded PostgreSQL. It is NOT recommended for production. + For production use, enable an external PostgreSQL database: + --set postgresql.enabled=false + --set externalDatabase.host=my-external-database-host + --set externalDatabase.port=5432 + --set externalDatabase.user=nemo + --set externalDatabase.database=nemoplatform + --set externalDatabase.existingSecret=my-existing-secret + --set externalDatabase.existingSecretPasswordKey=password + +{{- if not .Values.postgresql.persistence.enabled }} + + DATABASE PERSISTENCE DISABLED. Data will be lost if the pod restarts. +{{- end }} +{{- end }} + +{{- if or .Values.multinodeNetworking.aws.enabled .Values.multinodeNetworking.azure.enabled .Values.multinodeNetworking.gcp.enabled .Values.multinodeNetworking.oci.enabled }} +{{- $ncclFullname := include "nemo-platform.fullname" . }} +{{- $ncclTestBase := printf "%s-nccl-test" ($ncclFullname | trunc 40 | trimSuffix "-") | trunc 42 }} +{{- $ncclTestConfigMap := printf "%s-cm" $ncclTestBase | trunc 63 }} + + Multi-node networking is ENABLED. 
Kyverno policies will inject + appropriate configurations for distributed jobs. + + {{- if .Values.multinodeNetworking.aws.enabled }} + • AWS EFA devices configured ({{ .Values.multinodeNetworking.aws.efaDevicesPerGPU }} per GPU) + {{- end }} + {{- if .Values.multinodeNetworking.azure.enabled }} + • Azure InfiniBand/RDMA configured ({{ .Values.multinodeNetworking.azure.rdmaDevicesPerGPU }} per GPU) + {{- end }} + {{- if .Values.multinodeNetworking.gcp.enabled }} + • GCP TCP-X/TCP-XO configured + {{- end }} + {{- if .Values.multinodeNetworking.oci.enabled }} + • OCI InfiniBand/SR-IOV configured ({{ .Values.multinodeNetworking.oci.rdmaDevicesPerGPU }} per GPU) + {{- end }} + + Validate multi-node networking with the chart test (NCCL allreduce across nodes): + + helm test -n {{ $namespace }} {{ .Release.Name }} + + Set ncclTest.gpusPerNode in your values file to match the GPU count per node before + running the test. After changing any ncclTest values, run helm upgrade so the chart + state matches; then run helm test. + + You may also set ncclTest.waitTimeoutSeconds (wait for each worker pod) and + ncclTest.iterations (full test loops performed to increase reliability of the test) as needed; current settings are + {{ .Values.ncclTest.waitTimeoutSeconds }} seconds and + {{ .Values.ncclTest.iterations }} iterations. + + If you increase these, raise the Helm CLI timeout as well so the run can finish: + pass --timeout to helm test (e.g. --timeout 45m) and set it high enough for your + worker waits and iteration count. + + The test runs a cleanup job afterward, but the scripts ConfigMap may still remain; + deleting it is harmless if needed: + + kubectl delete configmap {{ $ncclTestConfigMap }} -n {{ $namespace }} + +{{- end }} diff --git a/k8s/helm/templates/_config-render.tpl b/k8s/helm/templates/_config-render.tpl new file mode 100644 index 0000000000..d1eb7e7f35 --- /dev/null +++ b/k8s/helm/templates/_config-render.tpl @@ -0,0 +1 @@ +{{- tpl .Values.basePlatformConfig . 
-}} diff --git a/k8s/helm/templates/_helpers.tpl b/k8s/helm/templates/_helpers.tpl new file mode 100644 index 0000000000..2be28e74bc --- /dev/null +++ b/k8s/helm/templates/_helpers.tpl @@ -0,0 +1,434 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "nemo-platform.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "nemo-platform.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "nemo-platform.labels" -}} +helm.sh/chart: {{ include "nemo-platform.chart" . }} +{{ include "nemo-platform.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "nemo-platform.selectorLabels" -}} +app.kubernetes.io/name: {{ include "nemo-platform.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Calculate the config from structured and unrendered base platform, with overrides +*/}} +{{- define "nemo-platform.calculatedConfig" -}} +{{ tpl (mergeOverwrite (include "nemo-platform.unstructuredConfig" . | fromYaml) .Values.platformConfig | toYaml) . }} +{{- end -}} + +{{/* +Calculate the config from the unrendered base platform, before any overrides +*/}} +{{- define "nemo-platform.unstructuredConfig" -}} +{{ include (print $.Template.BasePath "/_config-render.tpl") . }} +{{- end -}} + +{{/* +Determine if authentication is enabled from the calculated platform config (platformConfig.auth.enabled). +Returns "true" when auth is enabled, empty string otherwise. Use with: {{- if include "nemo-platform.authEnabled" . }} +*/}} +{{- define "nemo-platform.authEnabled" -}} +{{- $config := include "nemo-platform.calculatedConfig" . 
| fromYaml -}} +{{- if and $config $config.auth (eq $config.auth.enabled true) -}} +true +{{- end -}} +{{- end -}} + +{{/* +Determine if the calculated platform config uses the embedded PDP provider. +Returns "true" when auth is enabled and auth.policy_decision_point_provider is "embedded". +*/}} +{{- define "nemo-platform.embeddedPdpEnabled" -}} +{{- $config := include "nemo-platform.calculatedConfig" . | fromYaml -}} +{{- if and $config $config.auth (eq $config.auth.enabled true) (eq $config.auth.policy_decision_point_provider "embedded") -}} +true +{{- end -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +*/}} +{{- define "nemo-platform.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create the name of the configmap to use +*/}} +{{- define "nemo-platform.platform-configmap" -}} +{{- printf "%s-config" (include "nemo-platform.fullname" .) }} +{{- end }} + +{{/* +Default backend service name for ingress/HTTPRoute/OpenShift Route. +When auth and Envoy proxy are enabled, returns the Envoy service name; otherwise the API service name. +Use in values (e.g. ingress.hosts[].paths[].service) with tpl so routing points to the correct backend. +*/}} +{{- define "nemo-platform.ingressBackendService" -}} +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled -}} +{{ include "nmp-envoy.servicename" . }} +{{- else -}} +{{ include "nmp-api.api-servicename" . }} +{{- end -}} +{{- end -}} + +{{/* +Default backend port for ingress/HTTPRoute/OpenShift Route. 
+When auth and Envoy proxy are enabled, returns the Envoy service port; otherwise the API service port. +Use in values (e.g. ingress.hosts[].paths[].port) with tpl so routing points to the correct backend. +*/}} +{{- define "nemo-platform.ingressBackendPort" -}} +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled -}} +{{ .Values.envoyProxy.service.port }} +{{- else -}} +{{ .Values.api.service.port }} +{{- end -}} +{{- end -}} + +{{/* +Bind address for in-cluster platform runner pods. +*/}} +{{- define "nemo-platform.bindHost" -}} +{{- $config := include "nemo-platform.calculatedConfig" . | fromYaml -}} +{{- dig "service" "host" "0.0.0.0" $config -}} +{{- end -}} + +{{/* +Internal API URL for pods that need to call the platform API service. +*/}} +{{- define "nemo-platform.internalBaseUrl" -}} +{{- printf "http://%s:%s" (include "nmp-api.api-servicename" .) (toString .Values.api.service.port) -}} +{{- end -}} + +{{/* +Loopback API URL for the API pod itself when embedded auth must call back into the +local process instead of the cluster Service. +*/}} +{{- define "nemo-platform.apiLoopbackBaseUrl" -}} +{{- printf "http://localhost:%s" (toString .Values.api.service.port) -}} +{{- end -}} + +{{/* +Pod annotations +*/}} +{{- define "nemo-platform.podAnnotations" -}} +checksum/config: {{ include (print $.Template.BasePath "/platform-configmap.yaml") . | sha256sum }} +{{- end -}} + +{{/* +Custom image pull secret if not defined +*/}} +{{- define "nemo-common.imagePullSecretName" -}} +{{- if .Values.existingImagePullSecret -}} +{{ printf "%s" .Values.existingImagePullSecret }} +{{- else -}} +{{ printf "%s-imagepullsecret" (include "nemo-platform.fullname" .) }} +{{- end -}} +{{- end -}} + +{{/* +Image Pull Secrets +*/}} +{{- define "nemo-common.imagepullsecrets" -}} +- name: {{ include "nemo-common.imagePullSecretName" . }} +{{- with .Values.additionalImagePullSecrets }} +{{ toYaml . 
}} +{{ end }} +{{- end }} + +{{/* +Embedded PostgreSQL full name (service and secret name when postgresql.enabled). +*/}} +{{- define "nemo-common.postgresql.fullname" -}} +{{- printf "%s-postgres" (include "nemo-platform.fullname" . | trunc 54 | trimSuffix "-") -}} +{{- end -}} + +{{/* +Name of the service account to use for the embedded PostgreSQL pod. +*/}} +{{- define "nemo-common.postgresql.serviceAccountName" -}} +{{- if .Values.postgresql.serviceAccount.create -}} +{{- default (printf "%s-postgres" (include "nemo-platform.fullname" .)) .Values.postgresql.serviceAccount.name }} +{{- else -}} +{{- default "default" .Values.postgresql.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +PostgreSQL hostname: the embedded PostgreSQL full name when postgresql.enabled, otherwise externalDatabase.host +*/}} +{{- define "nemo-common.postgresql.host" -}} +{{- if .Values.postgresql.enabled -}} +{{ include "nemo-common.postgresql.fullname" . }} +{{- else -}} +{{ .Values.externalDatabase.host }} +{{- end -}} +{{- end -}} + +{{/* +nemo-common.postgresql.port chooses between the externalDatabase port and the embedded postgresql service port, rendered as an integer +*/}} +{{- define "nemo-common.postgresql.port" -}} +{{- if .Values.postgresql.enabled -}} +{{- printf "%d" (.Values.postgresql.service.port | int) -}} +{{- else -}} +{{- printf "%d" (.Values.externalDatabase.port | int) -}} +{{- end -}} +{{- end -}} + +{{/* +nemo-common.postgresql.user chooses between externalDatabase and the embedded postgresql user values +*/}} +{{- define "nemo-common.postgresql.user" -}} +{{- if .Values.postgresql.enabled -}} +{{- print .Values.postgresql.auth.username -}} +{{- else -}} +{{- print .Values.externalDatabase.user -}} +{{- end -}} +{{- end -}} + +{{/* +nemo-common.postgresql.name chooses between externalDatabase and the embedded postgresql db name values +*/}} +{{- define "nemo-common.postgresql.name" -}} +{{- if .Values.postgresql.enabled -}} +{{- print .Values.postgresql.auth.database -}} +{{- else -}} +{{- print .Values.externalDatabase.database -}} +{{- end -}} +{{- end -}} + +{{/*
+nemo-common.postgresql.secret-name chooses between externalDatabase and the embedded postgresql existing secret values; it renders empty when the embedded database is disabled and externalDatabase.existingSecret is unset +*/}} +{{- define "nemo-common.postgresql.secret-name" -}} +{{- if .Values.postgresql.enabled -}} +{{- if .Values.postgresql.auth.existingSecret -}} +{{- print .Values.postgresql.auth.existingSecret -}} +{{- else -}} +{{ include "nemo-common.postgresql.fullname" . }} +{{- end -}} +{{- else if .Values.externalDatabase.existingSecret -}} +{{- print .Values.externalDatabase.existingSecret -}} +{{- end -}} +{{- end -}} + +{{/* +nemo-common.postgresql.password-key returns "password" unless the embedded database is disabled and externalDatabase.existingSecret is set, in which case externalDatabase.existingSecretPasswordKey is used +*/}} +{{- define "nemo-common.postgresql.password-key" -}} +{{- if or .Values.postgresql.enabled (not .Values.externalDatabase.existingSecret) -}} +{{- print "password" -}} +{{- else -}} +{{- print .Values.externalDatabase.existingSecretPasswordKey -}} +{{- end -}} +{{- end -}} + +{{/* +nemo-common.postgresql.password generates a POSTGRES_DB_PASSWORD environment value if a full URI secret (externalDatabase.uriSecret name and key) isn't used +*/}} +{{- define "nemo-common.postgresql.password" -}} +{{- if not (and .Values.externalDatabase.uriSecret .Values.externalDatabase.uriSecret.name .Values.externalDatabase.uriSecret.key) }} +- name: POSTGRES_DB_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "nemo-common.postgresql.secret-name" .}} + key: {{ include "nemo-common.postgresql.password-key" .}} +{{- end }} +{{- end -}} + +{{/* +nemo-common.otel-env generates an env var array from the top-level telemetry configuration merged with the caller-supplied local telemetry map (local keys override).
+Follows the specification at https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/ + +Usage: + {{ include "nemo-common.otel-env" (dict "root" $ "local" .Values) }} +*/}} +{{- define "nemo-common.otel-env" -}} +{{- $root := .root -}} +{{- $local := .local -}} +{{- $globalOtel := $root.Values.telemetry | default dict -}} +{{- $localOtel := $local.telemetry | default dict -}} + +{{- $merged := dict -}} +{{- range $k, $v := $globalOtel }} + {{- $_ := set $merged $k $v }} +{{- end }} +{{- range $k, $v := $localOtel }} + {{- $_ := set $merged $k $v }} +{{- end }} + +{{- range $key, $val := $merged }} +- name: {{ $key }} + value: {{ $val | quote }} +{{- end }} +{{- end -}} + +{{/* +nemo-common.podSecurityContext merges global podSecurityContext with component-specific podSecurityContext. +Component values override global values. + +Usage: + {{ include "nemo-common.podSecurityContext" (dict "global" .Values.podSecurityContext "local" .Values.api.podSecurityContext) }} +*/}} +{{- define "nemo-common.podSecurityContext" -}} +{{- $global := .global | default dict -}} +{{- $local := .local | default dict -}} +{{- $merged := mergeOverwrite (deepCopy $global) $local -}} +{{- if $merged }} +{{- toYaml $merged }} +{{- end }} +{{- end -}} + +{{/* +nemo-common.securityContext merges global securityContext with component-specific securityContext. +Component values override global values. + +Usage: + {{ include "nemo-common.securityContext" (dict "global" .Values.securityContext "local" .Values.api.securityContext) }} +*/}} +{{- define "nemo-common.securityContext" -}} +{{- $global := .global | default dict -}} +{{- $local := .local | default dict -}} +{{- $merged := mergeOverwrite (deepCopy $global) $local -}} +{{- if $merged }} +{{- toYaml $merged }} +{{- end }} +{{- end -}} + +{{/* +Determine if multi-node networking is enabled for any cloud provider. +Returns "true" if any cloud provider networking is enabled, empty string otherwise. 
+ +Usage: + {{- if include "nemo-platform.multinodeNetworkingEnabled" . }} +*/}} +{{- define "nemo-platform.multinodeNetworkingEnabled" -}} +{{- if or .Values.multinodeNetworking.aws.enabled .Values.multinodeNetworking.azure.enabled .Values.multinodeNetworking.gcp.enabled .Values.multinodeNetworking.oci.enabled -}} +true +{{- end -}} +{{- end -}} + +{{/* +nemo-platform.env generates an env var array out of a dict to allow better +interleaving, easier use and default settings. It will still work if you use +an array to render directly, but it is not recommended. It is available across +all pods. +*/}} +{{- define "nemo-platform.env" -}} +{{- if and .Values.env (kindIs "slice" .Values.env) -}} +{{- toYaml .Values.env -}} +{{- else if and .Values.env (kindIs "map" .Values.env) -}} +{{- range $k, $v := .Values.env }} +- name: {{ $k }} + {{- if kindIs "map" $v }} + valueFrom: + {{ toYaml $v.valueFrom | nindent 4 | trim }} + {{- else }} + value: {{ $v | quote }} + {{- end }} +{{- end }} +{{- end -}} +{{- end -}} + +{{/* +nemo-platform.api.env generates an env var array out of a dict to allow better +interleaving, easier use and default settings. It will still work if you use +an array to render directly, but it is not recommended. It is available ONLY to the api pod. +*/}} +{{- define "nemo-platform.api.env" -}} +{{- if and .Values.api.env (kindIs "slice" .Values.api.env) -}} +{{- toYaml .Values.api.env -}} +{{- else if and .Values.api.env (kindIs "map" .Values.api.env) -}} +{{- range $k, $v := .Values.api.env }} +- name: {{ $k }} + {{- if kindIs "map" $v }} + valueFrom: + {{ toYaml $v.valueFrom | nindent 4 | trim }} + {{- else }} + value: {{ $v | quote }} + {{- end }} +{{- end }} +{{- end -}} +{{- end -}} + +{{/* +nemo-platform.controller.env generates an env var array out of a dict to allow better +interleaving, easier use and default settings. It will still work if you use +an array to render directly, but it is not recommended.
It is available ONLY to the controller pod. +*/}} +{{- define "nemo-platform.controller.env" -}} +{{- if and .Values.core.controller.env (kindIs "slice" .Values.core.controller.env) -}} +{{- toYaml .Values.core.controller.env -}} +{{- else if and .Values.core.controller.env (kindIs "map" .Values.core.controller.env) -}} +{{- range $k, $v := .Values.core.controller.env }} +- name: {{ $k }} + {{- if kindIs "map" $v }} + valueFrom: + {{ toYaml $v.valueFrom | nindent 4 | trim }} + {{- else }} + value: {{ $v | quote }} + {{- end }} +{{- end }} +{{- end -}} +{{- end -}} + +{{/* +nemo-platform.envoyProxy.env generates an env var array from .Values.envoyProxy.env (map of +NAME: value or NAME: valueFrom: {object}). Same format as nemo-platform.api.env. +*/}} +{{- define "nemo-platform.envoyProxy.env" -}} +{{- if and .Values.envoyProxy.env (kindIs "slice" .Values.envoyProxy.env) -}} +{{- toYaml .Values.envoyProxy.env -}} +{{- else if and .Values.envoyProxy.env (kindIs "map" .Values.envoyProxy.env) -}} +{{- range $k, $v := .Values.envoyProxy.env }} +- name: {{ $k }} + {{- if kindIs "map" $v }} + valueFrom: + {{ toYaml $v.valueFrom | nindent 4 | trim }} + {{- else }} + value: {{ $v | quote }} + {{- end }} +{{- end }} +{{- end -}} +{{- end -}} + +{{/* +Create the name of the models files auth secret (HF_TOKEN for Files service pull-through). +*/}} +{{- define "nemo-platform.modelsFilesAuthSecretName" -}} +{{- printf "%s-models-files-token" (include "nemo-platform.fullname" .) }} +{{- end }} diff --git a/k8s/helm/templates/api-env-secret.yaml b/k8s/helm/templates/api-env-secret.yaml new file mode 100644 index 0000000000..96d7386d79 --- /dev/null +++ b/k8s/helm/templates/api-env-secret.yaml @@ -0,0 +1,13 @@ +{{- if not .Values.envFromSecret }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "nemo-platform.fullname" . }}-api-env + labels: + {{- include "nemo-platform.labels" . 
| nindent 4 }} +type: Opaque +stringData: + # This is the default encryption key for encrypting secrets on the platform. WARNING: it is hard-coded in the chart, so every install that keeps the default shares it; set envFromSecret to a Secret of your own to skip this default. + NMP_SECRETS_DEFAULT_ENCRYPTION_KEY: "f4NPSp39YN5oWTwZ3iDX/L3PTvEH8qFvUs1noC/jWuo=" +{{- end }} diff --git a/k8s/helm/templates/api/_helpers.tpl b/k8s/helm/templates/api/_helpers.tpl new file mode 100644 index 0000000000..2521040dd2 --- /dev/null +++ b/k8s/helm/templates/api/_helpers.tpl @@ -0,0 +1,57 @@ +{{/* +Image Definition Parsing +Favor not using a separate registry because it is confusing, but support it. +*/}} +{{- define "nmp-api.image" -}} +{{- if .Values.api.image.registry -}} +{{ .Values.api.image.registry }}/{{ .Values.api.image.repository }}:{{ default .Chart.AppVersion .Values.api.image.tag }} +{{- else -}} +{{ .Values.api.image.repository }}:{{ default .Chart.AppVersion .Values.api.image.tag }} +{{- end }} +{{- end }} + +{{/* +Create a named api service name which can be included from parent chart (fullname is cut to 59 characters so that adding "-api" stays within 63) +*/}} +{{- define "nmp-api.api-servicename" }} +{{- printf "%s-api" ( include "nemo-platform.fullname" . | trunc 59 ) }} +{{- end }} + +{{/* +Create the name of the API service account to use +*/}} +{{- define "nmp-api.apiServiceAccountName" -}} +{{- if .Values.api.serviceAccount.create }} +{{- default (printf "%s-api" (include "nemo-platform.fullname" .)) .Values.api.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.api.serviceAccount.name }} +{{- end }} +{{- end }} + + +{{/* +Create the PVC name (the same template name is also defined in core/_helpers.tpl; keep the two definitions identical) +*/}} +{{- define "nmp-core.persistentVolumeClaim" -}} +{{- printf "%s-core-storage" (include "nemo-platform.fullname" .) }} +{{- end }} + +{{/* +Define whether local files backend is enabled: renders "true" when files.default_storage_config.type in the calculated config is "local" (also defined in core/_helpers.tpl) +*/}} +{{- define "nmp-core.localStorageEnabled" -}} +{{- if (include "nemo-platform.calculatedConfig" . | fromYaml).files -}} +{{- eq ( (include "nemo-platform.calculatedConfig" .
| fromYaml).files.default_storage_config.type ) "local" -}} +{{- else -}} +false +{{- end -}} +{{- end -}} + +{{/* +Create the local storage path for files +*/}} +{{- define "nmp-core.localStoragePath" -}} +{{- if (include "nemo-platform.calculatedConfig" . | fromYaml).files -}} +{{ (include "nemo-platform.calculatedConfig" . | fromYaml).files.default_storage_config.path | default "" }} +{{- end -}} +{{- end }} diff --git a/k8s/helm/templates/api/api-deployment.yaml b/k8s/helm/templates/api/api-deployment.yaml new file mode 100644 index 0000000000..60a19826ad --- /dev/null +++ b/k8s/helm/templates/api/api-deployment.yaml @@ -0,0 +1,151 @@ +{{- if .Values.api.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "nmp-api.api-servicename" . }} + labels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.labels" . | nindent 4 }} + annotations: + {{- with .Values.api.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if not .Values.api.autoscaling.enabled }} + replicas: {{ .Values.api.replicaCount }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + {{- with .Values.api.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- include "nemo-platform.podAnnotations" . | nindent 8 }} + labels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.labels" . | nindent 8 }} + {{- with .Values.api.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + imagePullSecrets: + {{ include "nemo-common.imagepullsecrets" . | nindent 8 | trim }} + serviceAccountName: {{ include "nmp-api.apiServiceAccountName" . 
}} + securityContext: + {{- include "nemo-common.podSecurityContext" (dict "global" .Values.podSecurityContext "local" .Values.api.podSecurityContext) | nindent 8 }} + containers: + - name: nmp-api + securityContext: + {{- include "nemo-common.securityContext" (dict "global" .Values.securityContext "local" .Values.api.securityContext) | nindent 12 }} + image: {{ include "nmp-api.image" . | quote }} + imagePullPolicy: {{ .Values.api.image.pullPolicy }} + args: + - "--service-group=all" + - "--host={{ include "nemo-platform.bindHost" . }}" + - "--port={{ .Values.api.service.port }}" + {{- range .Values.api.extraArgs }} + - {{ . | quote }} + {{- end }} + envFrom: + - secretRef: + name: {{ .Values.envFromSecret | default (printf "%s-api-env" (include "nemo-platform.fullname" .)) }} + env: + - name: NMP_CONFIG_FILE_PATH + value: /etc/nmp/config.yaml + - name: NMP_BASE_URL + {{- if include "nemo-platform.embeddedPdpEnabled" . }} + value: {{ include "nemo-platform.apiLoopbackBaseUrl" . | quote }} + {{- else }} + value: {{ include "nemo-platform.internalBaseUrl" . | quote }} + {{- end }} + {{- if include "nemo-platform.embeddedPdpEnabled" . }} + - name: NMP_AUTH_POLICY_DECISION_POINT_BASE_URL + value: {{ include "nemo-platform.apiLoopbackBaseUrl" . | quote }} + {{- end }} + - name: OTEL_SERVICE_NAME + value: {{ include "nmp-api.api-servicename" . | quote }} + {{- include "nemo-common.otel-env" (dict "root" $ "local" .Values) | indent 12 }} + {{- if ( and .Values.externalDatabase.uriSecret.name .Values.externalDatabase.uriSecret.key ) }} + - name: DATABASE_URL + valueFrom: + secretKeyRef: + name: {{ .Values.externalDatabase.uriSecret.name }} + key: {{ .Values.externalDatabase.uriSecret.key }} + {{- else }} + - name: DATABASE_DIALECT + value: "postgresql" + - name: DATABASE_USER + value: {{ include "nemo-common.postgresql.user" . | quote }}  # quoted so numeric or boolean-looking usernames stay strings + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "nemo-common.postgresql.secret-name" .
}} + key: {{ include "nemo-common.postgresql.password-key" . }} + - name: DATABASE_NAME + value: "{{ include "nemo-common.postgresql.name" . }}" + - name: DATABASE_HOST + value: "{{ include "nemo-common.postgresql.host" . }}" + - name: DATABASE_PORT + value: "{{ include "nemo-common.postgresql.port" . }}" + {{- end }} + - name: NGC_API_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.existingSecret | default "ngc-api" }} + key: NGC_API_KEY + {{ include "nemo-platform.env" . | nindent 12 | trim }} + {{ include "nemo-platform.api.env" . | nindent 12 | trim }} + ports: + - name: http + containerPort: {{ .Values.api.service.port }} + protocol: TCP + startupProbe: + {{- toYaml .Values.api.startupProbe | nindent 12 }} + livenessProbe: + {{- toYaml .Values.api.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.api.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.api.resources | nindent 12 }} + volumeMounts: + - name: config + mountPath: /etc/nmp/config.yaml + subPath: config.yaml + - name: data-designer-tmp + mountPath: /app/data/data-designer + {{- if eq ( include "nmp-core.localStorageEnabled" . ) "true" }} + - name: files-storage + mountPath: {{ include "nmp-core.localStoragePath" . }} + {{- end }} + volumes: + - name: data-designer-tmp + emptyDir: {} + - name: config + configMap: + name: {{ include "nemo-platform.platform-configmap" . }} + {{- if eq ( include "nmp-core.localStorageEnabled" . ) "true" }} + - name: files-storage + persistentVolumeClaim: + claimName: {{ include "nmp-core.persistentVolumeClaim" . }} + {{- end }} + {{- with .Values.api.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.api.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.api.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.api.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/api/api-hpa.yaml b/k8s/helm/templates/api/api-hpa.yaml new file mode 100644 index 0000000000..169de3a507 --- /dev/null +++ b/k8s/helm/templates/api/api-hpa.yaml @@ -0,0 +1,39 @@ +{{- if .Values.api.enabled }} +{{- if .Values.api.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "nmp-api.api-servicename" . }} + labels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.api.autoscaling.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "nmp-api.api-servicename" . }} + minReplicas: {{ .Values.api.autoscaling.minReplicas }} + maxReplicas: {{ .Values.api.autoscaling.maxReplicas }} + metrics: + {{- if .Values.api.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.api.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.api.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.api.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/api/api-pdb.yaml b/k8s/helm/templates/api/api-pdb.yaml new file mode 100644 index 0000000000..76501db559 --- /dev/null +++ b/k8s/helm/templates/api/api-pdb.yaml @@ -0,0 +1,25 @@ +{{- if .Values.api.enabled }} +{{- if .Values.api.podDisruptionBudget.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "nmp-api.api-servicename" . }} + labels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.api.podDisruptionBudget.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.selectorLabels" . | nindent 6 }} + {{- if .Values.api.podDisruptionBudget.maxUnavailable }} + maxUnavailable: {{ .Values.api.podDisruptionBudget.maxUnavailable }} + {{- else }} + minAvailable: {{ .Values.api.podDisruptionBudget.minAvailable | default 1 }} + {{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/api/api-service.yaml b/k8s/helm/templates/api/api-service.yaml new file mode 100644 index 0000000000..fae36a0ad5 --- /dev/null +++ b/k8s/helm/templates/api/api-service.yaml @@ -0,0 +1,23 @@ +{{- if .Values.api.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "nmp-api.api-servicename" . }} + labels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.api.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.api.service.type }} + ports: + - port: {{ .Values.api.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.selectorLabels" . | nindent 4 }} +{{- end }} diff --git a/k8s/helm/templates/api/api-serviceaccount.yaml b/k8s/helm/templates/api/api-serviceaccount.yaml new file mode 100644 index 0000000000..dc634cc186 --- /dev/null +++ b/k8s/helm/templates/api/api-serviceaccount.yaml @@ -0,0 +1,16 @@ +{{- if .Values.api.enabled }} +{{- if .Values.api.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nmp-api.apiServiceAccountName" . }} + labels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.api.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.api.serviceAccount.automount }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/api/api-servicemonitor.yaml b/k8s/helm/templates/api/api-servicemonitor.yaml new file mode 100644 index 0000000000..006dc7f8bb --- /dev/null +++ b/k8s/helm/templates/api/api-servicemonitor.yaml @@ -0,0 +1,37 @@ +{{- if .Values.api.enabled }} +{{- if .Values.api.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "nmp-api.api-servicename" . }} + labels: + app.kubernetes.io/component: nmp-api + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.api.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + annotations: + {{- with .Values.api.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.api.serviceMonitor.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + app.kubernetes.io/component: nmp-api  # select on the stable selector labels only, matching the Deployment, Service and PDB selectors + {{- include "nemo-platform.selectorLabels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace | quote }} + endpoints: + - port: http + {{- with .Values.api.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.api.serviceMonitor.scheme }} + scheme: {{ . }} + {{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/core/_helpers.tpl b/k8s/helm/templates/core/_helpers.tpl new file mode 100644 index 0000000000..9ae8cda7c0 --- /dev/null +++ b/k8s/helm/templates/core/_helpers.tpl @@ -0,0 +1,92 @@ +{{/* +Image Definition Parsing +Favor not using a separate registry because it is confusing, but support it.
+*/}} +{{- define "nmp-core.image" -}} +{{- if .Values.core.image.registry -}} +{{ .Values.core.image.registry }}/{{ .Values.core.image.repository }}:{{ default .Chart.AppVersion .Values.core.image.tag }} +{{- else -}} +{{ .Values.core.image.repository }}:{{ default .Chart.AppVersion .Values.core.image.tag }} +{{- end }} +{{- end }} + +{{/* +Create a named core service name which can be included from parent chart. The fullname is cut to 58 characters so that, together with the 5-character "-core" suffix, the result stays within the 63-character DNS-label limit that applies to Service names. +*/}} +{{- define "nmp-core.api-servicename" }} +{{- printf "%s-core" ( include "nemo-platform.fullname" . | trunc 58 ) }} +{{- end }} + +{{/* +Create a named core controller service name which can be included from parent chart. The fullname is cut to 47 characters so that, together with the 16-character "-core-controller" suffix, the result stays within the 63-character DNS-label limit that applies to Service names. +*/}} +{{- define "nmp-core.controller-servicename" }} +{{- printf "%s-core-controller" ( include "nemo-platform.fullname" . | trunc 47 ) }} +{{- end }} + +{{/* +Create a named core database-migrations name which can be included from parent chart. The fullname is cut to 47 characters so that, together with the 16-character "-core-migrations" suffix, the result stays within 63 characters. +*/}} +{{- define "nmp-core.database-migrations-servicename" }} +{{- printf "%s-core-migrations" ( include "nemo-platform.fullname" . | trunc 47 )
}} +{{- end }} + +{{/* +Create the name of the API service account to use +*/}} +{{- define "nmp-core.apiServiceAccountName" -}} +{{- if .Values.core.api.serviceAccount.create }} +{{- default (printf "%s-core" (include "nemo-platform.fullname" .)) .Values.core.api.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.core.api.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Create the name of the Controller service account to use +*/}} +{{- define "nmp-core.controllerServiceAccountName" -}} +{{- if .Values.core.controller.serviceAccount.create }} +{{- default (printf "%s-core-controller" (include "nemo-platform.fullname" .)) .Values.core.controller.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.core.controller.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Create the name of the Jobs service account to use (for pods created by the jobs controller) +*/}} +{{- define "nmp-core.jobsServiceAccountName" -}} +{{- if .Values.core.jobs.serviceAccount.create }} +{{- default (printf "%s-jobs" (include "nemo-platform.fullname" .)) .Values.core.jobs.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.core.jobs.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Create the PVC name +*/}} +{{- define "nmp-core.persistentVolumeClaim" -}} +{{- printf "%s-core-storage" (include "nemo-platform.fullname" .) }} +{{- end }} + +{{/* +Define whether local files backend is enabled. +Renders "true" only when files.default_storage_config.type in the calculated config equals "local"; +a missing or empty "files" or "default_storage_config" section renders "false" instead of aborting the render. +*/}} +{{- define "nmp-core.localStorageEnabled" -}} +{{- $files := (include "nemo-platform.calculatedConfig" . | fromYaml).files | default dict -}} +{{- $storage := get $files "default_storage_config" | default dict -}} +{{- eq (get $storage "type" | toString) "local" -}} +{{- end -}} + +{{/* +Create the local storage path for files +*/}} +{{- define "nmp-core.localStoragePath" -}} +{{- if (include "nemo-platform.calculatedConfig" . | fromYaml).files -}} +{{ (include "nemo-platform.calculatedConfig" .
| fromYaml).files.default_storage_config.path | default "" }} +{{- end -}} +{{- end }} diff --git a/k8s/helm/templates/core/controller-deployment.yaml b/k8s/helm/templates/core/controller-deployment.yaml new file mode 100644 index 0000000000..7eb0271947 --- /dev/null +++ b/k8s/helm/templates/core/controller-deployment.yaml @@ -0,0 +1,133 @@ +{{- if .Values.core.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "nmp-core.controller-servicename" . }} + labels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.labels" . | nindent 4 }} + annotations: + {{- with .Values.core.controller.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + # Controller always runs with a single replica + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + {{- with .Values.core.controller.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- include "nemo-platform.podAnnotations" . | nindent 8 }} + labels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.labels" . | nindent 8 }} + {{- with .Values.core.controller.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + imagePullSecrets: + {{ include "nemo-common.imagepullsecrets" . | nindent 8 | trim }} + serviceAccountName: {{ include "nmp-core.controllerServiceAccountName" . }} + securityContext: + {{- include "nemo-common.podSecurityContext" (dict "global" .Values.podSecurityContext "local" .Values.core.controller.podSecurityContext) | nindent 8 }} + containers: + - name: nmp-core + securityContext: + {{- include "nemo-common.securityContext" (dict "global" .Values.securityContext "local" .Values.core.controller.securityContext) | nindent 12 }} + image: {{ include "nmp-core.image" . 
| quote }} + imagePullPolicy: {{ .Values.core.image.pullPolicy }} + args: + - "--controller-group=all" + - "--host={{ include "nemo-platform.bindHost" . }}" + - "--port={{ .Values.core.controller.service.port | default 8000 }}" + {{- range .Values.core.controller.extraArgs }} + - {{ . | quote }} + {{- end }} + env: + - name: NMP_CONFIG_FILE_PATH + value: /etc/nmp/config.yaml + - name: NMP_BASE_URL + value: {{ include "nemo-platform.internalBaseUrl" . | quote }} + - name: OTEL_SERVICE_NAME + value: {{ include "nmp-core.controller-servicename" . }} + {{- include "nemo-common.otel-env" (dict "root" $ "local" .Values) | indent 12 }} + {{- if ( and .Values.externalDatabase.uriSecret.name .Values.externalDatabase.uriSecret.key ) }} + - name: DATABASE_URL + valueFrom: + secretKeyRef: + name: {{ .Values.externalDatabase.uriSecret.name }} + key: {{ .Values.externalDatabase.uriSecret.key }} + {{- else }} + - name: DATABASE_DIALECT + value: "postgresql" + - name: DATABASE_USER + value: {{ include "nemo-common.postgresql.user" . }} + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "nemo-common.postgresql.secret-name" . }} + key: {{ include "nemo-common.postgresql.password-key" . }} + - name: DATABASE_NAME + value: "{{ include "nemo-common.postgresql.name" . }}" + - name: DATABASE_HOST + value: "{{ include "nemo-common.postgresql.host" . }}" + - name: DATABASE_PORT + value: "{{ include "nemo-common.postgresql.port" . }}" + {{- end }} + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + {{ include "nemo-platform.env" . | nindent 12 | trim }} + {{ include "nemo-platform.controller.env" . | nindent 12 | trim }} + ports: + - name: http + containerPort: {{ .Values.core.controller.service.port | default 8000 }} + protocol: TCP + {{- with .Values.core.controller.startupProbe }} + startupProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.core.controller.livenessProbe }} + livenessProbe: + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- with .Values.core.controller.readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + resources: + {{- toYaml .Values.core.controller.resources | nindent 12 }} + volumeMounts: + - name: config + mountPath: /etc/nmp/config.yaml + subPath: config.yaml + volumes: + - name: config + configMap: + name: {{ include "nemo-platform.platform-configmap" . }} + {{- with .Values.core.controller.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.core.controller.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.core.controller.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.core.controller.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/core/controller-role.yaml b/k8s/helm/templates/core/controller-role.yaml new file mode 100644 index 0000000000..b3a5f3604a --- /dev/null +++ b/k8s/helm/templates/core/controller-role.yaml @@ -0,0 +1,67 @@ +{{- if .Values.core.enabled }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "nmp-core.controllerServiceAccountName" . }} + labels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.labels" . 
| nindent 4 }} +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch", "create"] +- apiGroups: [""] + resources: ["pods/log"] + verbs: ["get", "list"] +- apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch"] +- apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "get", "list", "watch", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "create", "delete"] +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "create", "delete"] +- apiGroups: [""] + resources: ["secrets"] + verbs: ["create", "delete"] +{{- if .Values.rbac.volcanoEnabled }} +# Volcano +- apiGroups: ["batch.volcano.sh"] + resources: ["jobs", "jobs/status"] + verbs: ["get","list","watch","update","delete","create","patch"] +- apiGroups: ["scheduling.incubator.k8s.io", "scheduling.volcano.sh"] + resources: ["queues", "queues/status", "podgroups"] + verbs: ["create", "get", "list", "watch", "update", "patch", "bind", "updateStatus", "delete"] +{{- end }} +{{- if .Values.rbac.k8sNimOperatorEnabled }} +# NIM Operator +- apiGroups: ["apps.nvidia.com"] + resources: ["nimservices", "nimservices/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["apps.nvidia.com"] + resources: ["nimcaches", "nimcaches/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "nmp-core.controllerServiceAccountName" . }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +subjects: +- kind: ServiceAccount + name: {{ include "nmp-core.controllerServiceAccountName" . }} + namespace: {{ .Release.Namespace }} +roleRef: + kind: Role + name: {{ include "nmp-core.controllerServiceAccountName" . 
}} + apiGroup: rbac.authorization.k8s.io +{{- end }} diff --git a/k8s/helm/templates/core/controller-service-headless.yaml b/k8s/helm/templates/core/controller-service-headless.yaml new file mode 100644 index 0000000000..b8a1846bc2 --- /dev/null +++ b/k8s/helm/templates/core/controller-service-headless.yaml @@ -0,0 +1,24 @@ +{{- if .Values.core.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "nmp-core.controller-servicename" . }} + labels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.core.controller.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: ClusterIP + clusterIP: None + ports: + - port: {{ .Values.core.controller.service.port | default 8000 }} # falls back to 8000, the same default the controller Deployment applies to its --port argument and containerPort + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.selectorLabels" . | nindent 4 }} +{{- end }} diff --git a/k8s/helm/templates/core/controller-serviceaccount.yaml b/k8s/helm/templates/core/controller-serviceaccount.yaml new file mode 100644 index 0000000000..62575d3554 --- /dev/null +++ b/k8s/helm/templates/core/controller-serviceaccount.yaml @@ -0,0 +1,16 @@ +{{- if .Values.core.enabled }} +{{- if .Values.core.controller.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nmp-core.controllerServiceAccountName" . }} + labels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.core.controller.serviceAccount.annotations }} + annotations: + {{- toYaml .
| nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.core.controller.serviceAccount.automount }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/core/controller-servicemonitor.yaml b/k8s/helm/templates/core/controller-servicemonitor.yaml new file mode 100644 index 0000000000..01a409d91c --- /dev/null +++ b/k8s/helm/templates/core/controller-servicemonitor.yaml @@ -0,0 +1,37 @@ +{{- if .Values.core.enabled }} +{{- if .Values.core.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "nmp-core.controller-servicename" . }} + labels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.core.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + annotations: + {{- with .Values.core.controller.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.core.serviceMonitor.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + app.kubernetes.io/component: nmp-core-controller + {{- include "nemo-platform.labels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http + {{- with .Values.core.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.core.serviceMonitor.scheme }} + scheme: {{ . }} + {{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/core/jobs-serviceaccount.yaml b/k8s/helm/templates/core/jobs-serviceaccount.yaml new file mode 100644 index 0000000000..f0d668a6e6 --- /dev/null +++ b/k8s/helm/templates/core/jobs-serviceaccount.yaml @@ -0,0 +1,16 @@ +{{- if .Values.core.enabled }} +{{- if .Values.core.jobs.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nmp-core.jobsServiceAccountName" . }} + labels: + app.kubernetes.io/component: nmp-core-jobs + {{- include "nemo-platform.labels" . 
| nindent 4 }} + {{- with .Values.core.jobs.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.core.jobs.serviceAccount.automount }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/core/shared-pvc.yaml b/k8s/helm/templates/core/shared-pvc.yaml new file mode 100644 index 0000000000..62ee3dd800 --- /dev/null +++ b/k8s/helm/templates/core/shared-pvc.yaml @@ -0,0 +1,25 @@ +{{- if .Values.core.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "nmp-core.persistentVolumeClaim" . }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.core.storage.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.core.storage.existingPersistentVolumeName }} + volumeName: {{ .Values.core.storage.existingPersistentVolumeName }} + {{- else if .Values.core.storage.storageClass }} + storageClassName: {{ .Values.core.storage.storageClass }} + {{- end }} + {{- with .Values.core.storage.accessModes }} + accessModes: + {{- toYaml . | nindent 4 }} + {{- end }} + resources: + requests: + storage: {{ .Values.core.storage.size }} +{{- end }} diff --git a/k8s/helm/templates/httproute.yaml b/k8s/helm/templates/httproute.yaml new file mode 100644 index 0000000000..cf9687ad3e --- /dev/null +++ b/k8s/helm/templates/httproute.yaml @@ -0,0 +1,40 @@ +{{- if .Values.httpRoute.enabled -}} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "nemo-platform.name" . }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.httpRoute.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.httpRoute.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + parentRefs: + {{- required "A valid .Values.httpRoute.parentRefs entry is required when httpRoute.enabled is true" .Values.httpRoute.parentRefs | toYaml | nindent 4 }} + {{- with .Values.httpRoute.hostnames }} + hostnames: + {{- toYaml . | nindent 4 }} + {{- end }} + rules: + {{ range .Values.httpRoute.pathRules }} + - matches: + {{ range .matches }} + - path: + type: {{ .type }} + value: {{ .path }} + {{- end }} + {{- if $.Values.httpRoute.filters }} + filters: + {{- toYaml $.Values.httpRoute.filters | nindent 8 }} + {{- end }} + backendRefs: + {{ range .backends }} + - name: {{ tpl .service $ }} + port: {{ tpl .port $ }} + {{- end }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/imagepull-secret.yaml b/k8s/helm/templates/imagepull-secret.yaml new file mode 100644 index 0000000000..264af2cd99 --- /dev/null +++ b/k8s/helm/templates/imagepull-secret.yaml @@ -0,0 +1,15 @@ +{{- if not .Values.existingImagePullSecret }} +{{- $registry := "nvcr.io" -}} +{{- $username := "$oauthtoken" -}} +{{- $password := .Values.ngcAPIKey | default "" | toString -}} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "nemo-common.imagePullSecretName" . }} + labels: + {{- include "nemo-platform.labels" $ | nindent 4 }} +type: kubernetes.io/dockerconfigjson +data: + {{- /* The Docker config is built as a dict and serialised with toJson so that quotes or backslashes in the API key are escaped; an unset ngcAPIKey is treated as an empty password. */}} + .dockerconfigjson: {{ dict "auths" (dict $registry (dict "username" $username "password" $password "auth" (printf "%s:%s" $username $password | b64enc))) | toJson | b64enc | quote }} +{{- end }} diff --git a/k8s/helm/templates/ingress.yaml b/k8s/helm/templates/ingress.yaml new file mode 100644 index 0000000000..e34f34a35a --- /dev/null +++ b/k8s/helm/templates/ingress.yaml @@ -0,0 +1,56 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "nemo-platform.name" . }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml .
| nindent 4 }} + {{- end }} +spec: + ingressClassName: {{ .Values.ingress.className }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- if .Values.ingress.defaultHost }} + - host: {{ .Values.ingress.defaultHost | quote }} + http: + paths: + {{- range (index .Values.ingress.hosts 0).paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ tpl .service $ }} + port: + number: {{ tpl .port $ | int }} + {{- end }} + {{- else }} + {{- range .Values.ingress.hosts }} + - {{- if .name }} + host: {{ .name | quote }} + {{- end }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ tpl .service $ }} + port: + number: {{ tpl .port $ | int }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/models-files-auth-secret.yaml b/k8s/helm/templates/models-files-auth-secret.yaml new file mode 100644 index 0000000000..a478a37713 --- /dev/null +++ b/k8s/helm/templates/models-files-auth-secret.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "nemo-platform.modelsFilesAuthSecretName" . }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +type: Opaque +stringData: + # Placeholder for NIMCache when pulling from Files service HF-compatible API (FILES_SERVICE / FILES_SERVICE_SFT). + # Files acts as pull-through; this value is used for auth to the platform, not to Hugging Face directly. 
+ HF_TOKEN: "service:models" diff --git a/k8s/helm/templates/networking/kyverno-policy.yaml b/k8s/helm/templates/networking/kyverno-policy.yaml new file mode 100644 index 0000000000..0c7800e232 --- /dev/null +++ b/k8s/helm/templates/networking/kyverno-policy.yaml @@ -0,0 +1,549 @@ +{{- /* +These Kyverno policies inject cloud-specific networking and NCCL configurations +for multi-node GPU training jobs. They are triggered by the "nmp.nvidia.com/enable-multi-node-networking" +annotation set by the Volcano backend when num_nodes > 1. + +Requirements: +- Kyverno must be installed in the cluster +- Enable the appropriate cloud policy via values.yaml (multinodeNetworking.aws, azure, gcp, or oci) +- Configure device counts per GPU for your cloud provider + +The policies watch for pods created by Volcano Jobs (batch.volcano.sh) and inject: +- AWS: EFA devices, mounts, and configuration +- Azure: RDMA devices, InfiniBand configuration, NCCL topology +- GCP: TCP-X daemon, multi-network interfaces +- OCI: RDMA devices, SR-IOV networks + +Annotations used: +- nmp.nvidia.com/enable-multi-node-networking: "true" (triggers policy) +- nmp.nvidia.com/num-nodes: "" (used by OCI policy for network calculation) + +Note: Only enable ONE cloud policy per cluster deployment. +*/ -}} +{{- $enabledCount := 0 }} +{{- if .Values.multinodeNetworking.aws.enabled }}{{ $enabledCount = add1 $enabledCount }}{{- end }} +{{- if .Values.multinodeNetworking.azure.enabled }}{{ $enabledCount = add1 $enabledCount }}{{- end }} +{{- if .Values.multinodeNetworking.gcp.enabled }}{{ $enabledCount = add1 $enabledCount }}{{- end }} +{{- if .Values.multinodeNetworking.oci.enabled }}{{ $enabledCount = add1 $enabledCount }}{{- end }} +{{- if gt $enabledCount 1 }} +{{- fail "Error: Only ONE cloud provider can be enabled at a time in multinodeNetworking. Please enable only aws, azure, gcp, or oci." 
}} +{{- end }} +{{- if .Values.multinodeNetworking.aws.enabled }} +--- +apiVersion: kyverno.io/v1 +kind: Policy +metadata: + name: {{ include "nemo-platform.fullname" . }}-aws-efa-policy + namespace: {{ .Release.Namespace }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +spec: + admission: true + background: false + rules: + - match: + any: + - resources: + kinds: + - Pod + operations: + - CREATE + annotations: + nmp.nvidia.com/enable-multi-node-networking: "true" + mutate: + foreach: + - list: request.object.spec.containers + context: + - name: requestedGpus + variable: + jmesPath: "to_number(element.resources.requests.\"nvidia.com/gpu\" || '0')" + - name: efaNum + variable: + jmesPath: to_string(multiply(requestedGpus,`{{ .Values.multinodeNetworking.aws.efaDevicesPerGPU }}`)) + preconditions: + all: + - key: '{{ `{{ element.resources.requests."nvidia.com/gpu" || '''' }}` }}' + operator: NotEquals + value: "" + patchStrategicMerge: + spec: + volumes: + - hostPath: + path: /opt/amazon-efa-ofi + type: Directory + name: amazon-efa + - name: nv-shared-memory + emptyDir: + medium: Memory + containers: + - (name): '{{ `{{ element.name }}` }}' + resources: + limits: + vpc.amazonaws.com/efa: '{{ `{{ efaNum }}` }}' + requests: + cpu: '{{ `{{ element.resources.requests.cpu || ''100m'' }}` }}' + vpc.amazonaws.com/efa: '{{ `{{ efaNum }}` }}' + env: + # Disable huge pages. Reference: https://github.com/aws/aws-ofi-nccl/blob/master/doc/efa-env-var.md + # TLDR Disabling huge page causes minor performance hit, but it's needed to prevent fork fails due to the operating system running out of huge pages. 
+ - name: FI_EFA_USE_HUGE_PAGE + value: "0" + volumeMounts: + - mountPath: /opt/amazon-efa-ofi + name: amazon-efa + readOnly: true + - mountPath: /dev/shm + name: nv-shared-memory + name: container-efa-volume-mounts-worker + preconditions: + all: + - key: '{{ `{{ request.object.metadata.ownerReferences[].kind || '''' }}` }}' + operator: AnyIn + value: + - PyTorchJob + - MPIJob + - Job + - RunaiJob + - JobSet + - key: '{{ `{{ request.object.metadata.ownerReferences[].apiVersion.split(@, ''/'')[0] }}` }}' + operator: AnyIn + value: + - batch.volcano.sh + - kubeflow.org + - run.ai + - jobset.x-k8s.io + - batch + skipBackgroundRequests: true +{{- end }} + +{{- if .Values.multinodeNetworking.azure.enabled }} +--- +apiVersion: kyverno.io/v1 +kind: Policy +metadata: + name: {{ include "nemo-platform.fullname" . }}-azure-rdma-policy + namespace: {{ .Release.Namespace }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +spec: + admission: true + background: false + rules: + - match: + any: + - resources: + kinds: + - Pod + operations: + - CREATE + annotations: + nmp.nvidia.com/enable-multi-node-networking: "true" + mutate: + foreach: + - list: request.object.spec.containers + context: + - name: requestedGpus + variable: + jmesPath: "to_number(element.resources.requests.\"nvidia.com/gpu\" || '0')" + - name: rdmaNum + variable: + jmesPath: to_string(multiply(requestedGpus,`{{ .Values.multinodeNetworking.azure.rdmaDevicesPerGPU }}`)) + preconditions: + all: + - key: '{{ `{{ element.resources.requests."nvidia.com/gpu" || '''' }}` }}' + operator: NotEquals + value: "" + patchStrategicMerge: + spec: + volumes: + - emptyDir: + medium: Memory + name: nv-shared-memory + - configMap: + defaultMode: 420 + name: {{ include "nemo-platform.fullname" . 
}}-nccl-topo + name: nccl-topo + containers: + - (name): '{{ `{{ element.name }}` }}' + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: + - IPC_LOCK + seccompProfile: + type: RuntimeDefault + resources: + limits: + rdma/{{ .Values.multinodeNetworking.azure.rdmaDeviceName }}: '{{ `{{ rdmaNum }}` }}' + requests: + rdma/{{ .Values.multinodeNetworking.azure.rdmaDeviceName }}: '{{ `{{ rdmaNum }}` }}' + env: + - name: "CUDA_DEVICE_ORDER" + value: "PCI_BUS_ID" + - name: "NCCL_IB_PCI_RELAXED_ORDERING" + value: "1" + - name: "NCCL_SOCKET_IFNAME" + value: "eth0" + - name: "NCCL_TOPO_FILE" + value: "/etc/nccl/nccl-topo.xml" # the nccl-topo.xml key of the nccl-topo ConfigMap volume, mounted at /etc/nccl below + - name: "OMPI_MCA_coll_hcoll_enable" + value: "0" + - name: "OMPI_MCA_orte_keep_fqdn_hostnames" + value: "t" + - name: "UCX_IB_PCI_RELAXED_ORDERING" + value: "on" + - name: "UCX_MEM_EVENTS" + value: "n" + - name: "UCX_NET_DEVICES" + value: "eth0" + - name: "UCX_TLS" + value: "tcp" + - name: "KUBE_NODE_TYPE" + value: "kube-gpu-node" + volumeMounts: + - mountPath: /dev/shm + name: nv-shared-memory + - mountPath: /etc/nccl + name: nccl-topo + readOnly: true + + name: container-rdma-nccl-configs-worker + preconditions: + all: + - key: '{{ `{{ request.object.metadata.ownerReferences[].kind || '''' }}` }}' + operator: AnyIn + value: + - PyTorchJob + - MPIJob + - Job + - RunaiJob + - JobSet + - key: '{{ `{{ request.object.metadata.ownerReferences[].apiVersion.split(@, ''/'')[0] }}` }}' + operator: AnyIn + value: + - batch.volcano.sh + - kubeflow.org + - run.ai + - jobset.x-k8s.io + - batch + skipBackgroundRequests: true +{{- end }} + +{{- if .Values.multinodeNetworking.gcp.enabled -}} +--- +apiVersion: kyverno.io/v1 +kind: Policy +metadata: + name: {{ include "nemo-platform.fullname" . }}-gcp-tcpxo-policy + namespace: {{ .Release.Namespace }} + labels: + {{- include "nemo-platform.labels" .
| nindent 4 }} +spec: + admission: true + background: false + rules: + - match: + any: + - resources: + kinds: + - Pod + operations: + - CREATE + annotations: + nmp.nvidia.com/enable-multi-node-networking: "true" + mutate: + foreach: + - list: request.object.spec.containers + preconditions: + all: + - key: '{{ `{{ element.resources.requests."nvidia.com/gpu" || '''' }}` }}' + operator: NotEquals + value: "" + patchStrategicMerge: + metadata: # pod annotations must be nested under metadata (as in the OCI policy) for the patch to set them on the Pod + annotations: + devices.gke.io/container.tcpxo-daemon: | + - path: /dev/nvidia0 + - path: /dev/nvidia1 + - path: /dev/nvidia2 + - path: /dev/nvidia3 + - path: /dev/nvidia4 + - path: /dev/nvidia5 + - path: /dev/nvidia6 + - path: /dev/nvidia7 + - path: /dev/nvidiactl + - path: /dev/nvidia-uvm + - path: /dev/dmabuf_import_helper + networking.gke.io/default-interface: eth0 + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gpu-nic0"}, + {"interfaceName":"eth2","network":"gpu-nic1"}, + {"interfaceName":"eth3","network":"gpu-nic2"}, + {"interfaceName":"eth4","network":"gpu-nic3"}, + {"interfaceName":"eth5","network":"gpu-nic4"}, + {"interfaceName":"eth6","network":"gpu-nic5"}, + {"interfaceName":"eth7","network":"gpu-nic6"}, + {"interfaceName":"eth8","network":"gpu-nic7"} + ] + spec: + initContainers: + - args: + - |2 + set -ex + chmod 755 /fts/entrypoint_rxdm_container.sh + /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr --enforce_kernel_ipv6_support=false + command: + - /bin/sh + - -c + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.8 + imagePullPolicy: Always + name: tcpxo-daemon + resources: {} + restartPolicy: Always + securityContext: + capabilities: + add: + - CAP_NET_ADMIN + - CAP_NET_BIND_SERVICE + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /usr/local/nvidia + name:
nvtcpxo-libraries + readOnly: true + - mountPath: /hostsysfs + name: nvtcpxo-sys + - mountPath: /hostprocsysfs + name: nvtcpxo-proc-sys + containers: + - (name): '{{ `{{ element.name }}` }}' + env: + - name: NCCL_FASTRAK_CTRL_DEV + value: "eth0" + - name: NCCL_FASTRAK_IFNAME + value: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_CROSS_NIC + value: "0" + - name: NCCL_ALGO + value: "Ring,Tree" + - name: NCCL_PROTO + value: "Simple" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: NCCL_TUNER_PLUGIN + value: "libnccl-tuner.so" + - name: NCCL_TUNER_CONFIG_PATH + value: "/usr/local/nvidia/lib64/a3plus_tuner_config.textproto" + - name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE + value: "/usr/local/nvidia/lib64/a3plus_guest_config.textproto" + - name: NCCL_DYNAMIC_CHUNK_SIZE + value: "524288" + - name: NCCL_P2P_NET_CHUNKSIZE + value: "524288" + - name: NCCL_P2P_PCI_CHUNKSIZE + value: "524288" + - name: NCCL_P2P_NVL_CHUNKSIZE + value: "1048576" + - name: NCCL_FASTRAK_NUM_FLOWS + value: "2" + - name: NCCL_FASTRAK_USE_SNAP + value: "1" + - name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS + value: "600000" + - name: NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL + value: "0" + - name: NCCL_BUFFSIZE + value: "8388608" + - name: NCCL_NET_GDR_LEVEL + value: "PIX" + - name: NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING + value: "0" + - name: NCCL_FASTRAK_USE_LLCM + value: "1" + - name: NCCL_NVLS_ENABLE + value: "0" + - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY + value: "/dev/aperture_devices" + volumeMounts: + - mountPath: /scratch + name: scratch + - mountPath: /dev/shm + name: nv-shared-memory + - mountPath: /usr/local/nvidia + name: nvtcpxo-libraries + readOnly: true + - mountPath: /dev/aperture_devices + name: nvtcpxo-aperture-devices + volumes: + - name: scratch + emptyDir: {} + - name: nv-shared-memory + emptyDir: + medium: Memory + - name: nvtcpxo-sys + hostPath: + path: /sys + - name: nvtcpxo-proc-sys + hostPath: + path: /proc/sys + - 
name: nvtcpxo-libraries + hostPath: + path: /home/kubernetes/bin/nvidia + - name: nvtcpxo-aperture-devices + hostPath: + path: /dev/aperture_devices + name: container-gcp-rdma-nccl-configs-worker + preconditions: + all: + - key: '{{ `{{ request.object.metadata.ownerReferences[].kind || '''' }}` }}' + operator: AnyIn + value: + - PyTorchJob + - MPIJob + - Job + - RunaiJob + - JobSet + - key: '{{ `{{ request.object.metadata.ownerReferences[].apiVersion.split(@, ''/'')[0] }}` }}' + operator: AnyIn + value: + - batch.volcano.sh + - kubeflow.org + - run.ai + - jobset.x-k8s.io + - batch + skipBackgroundRequests: true +{{- end }} + +{{- if .Values.multinodeNetworking.oci.enabled }} +--- +apiVersion: kyverno.io/v1 +kind: Policy +metadata: + name: {{ include "nemo-platform.fullname" . }}-oci-rdma-policy + namespace: {{ .Release.Namespace }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +spec: + admission: true + background: false + rules: + - match: + any: + - resources: + kinds: + - Pod + operations: + - CREATE + annotations: + nmp.nvidia.com/enable-multi-node-networking: "true" + context: + - name: numNodes + variable: + jmesPath: "to_number(request.object.metadata.annotations.\"nmp.nvidia.com/num-nodes\" || '1')" + - name: baseNetworkListString + variable: + value: {{ repeat 128 "network-operator/sriov-net," | trimSuffix "," }} + - name: operatorLength + variable: + jmesPath: length('network-operator/sriov-net,') + mutate: + foreach: + - list: request.object.spec.containers + context: + - name: requestedGpus + variable: + jmesPath: "to_number(element.resources.requests.\"nvidia.com/gpu\" || '0')" + - name: mlnxnicsNum + variable: + jmesPath: to_string(multiply(requestedGpus,`{{ .Values.multinodeNetworking.oci.rdmaDevicesPerGPU }}`)) + - name: nodeGpuProduct + variable: + jmesPath: multiply(numNodes,requestedGpus) + - name: truncateLength + variable: + jmesPath: subtract(multiply(operatorLength, to_number(mlnxnicsNum)), `1`) + - name: truncatedNetwork + 
variable: + jmesPath: truncate(baseNetworkListString, truncateLength) + preconditions: + all: + - key: '{{ `{{ element.resources.requests."nvidia.com/gpu" || '''' }}` }}' + operator: NotEquals + value: "" + patchStrategicMerge: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: '{{ `{{ truncatedNetwork }}` }}' + spec: + containers: + - (name): '{{ `{{ element.name }}` }}' + resources: + limits: + nvidia.com/mlnxnics: '{{ `{{ mlnxnicsNum }}` }}' + requests: + nvidia.com/mlnxnics: '{{ `{{ mlnxnicsNum }}` }}' + env: + - name: "CUDA_DEVICE_ORDER" + value: "PCI_BUS_ID" + - name: "NCCL_IB_PCI_RELAXED_ORDERING" + value: "1" + - name: "NCCL_SOCKET_IFNAME" + value: "eth0" + - name: "OMPI_MCA_coll_hcoll_enable" + value: "0" + - name: "OMPI_MCA_orte_keep_fqdn_hostnames" + value: "t" + - name: "UCX_IB_PCI_RELAXED_ORDERING" + value: "on" + - name: "UCX_MEM_EVENTS" + value: "n" + - name: "UCX_NET_DEVICES" + value: "eth0" + - name: "UCX_TLS" + value: "tcp" + - name: "KUBE_NODE_TYPE" + value: "kube-gpu-node" + - name: "NCCL_IB_TC" + value: "41" + - name: "NCCL_IB_SL" + value: "0" + - name: "NCCL_IB_QPS_PER_CONNECTION" + value: "4" + - name: "NCCL_IB_GID_INDEX" + value: "3" + - name: "NCCL_IB_SPLIT_DATA_ON_QPS" + value: "0" + volumeMounts: + - mountPath: /dev/shm + name: nv-shared-memory + volumes: + - emptyDir: + medium: Memory + name: nv-shared-memory + name: container-oci-rdma-nccl-configs-worker + preconditions: + all: + - key: '{{ `{{ request.object.metadata.ownerReferences[].kind || '''' }}` }}' + operator: AnyIn + value: + - PyTorchJob + - MPIJob + - Job + - RunaiJob + - JobSet + - key: '{{ `{{ request.object.metadata.ownerReferences[].apiVersion.split(@, ''/'')[0] }}` }}' + operator: AnyIn + value: + - batch.volcano.sh + - kubeflow.org + - run.ai + - jobset.x-k8s.io + - batch + skipBackgroundRequests: true +{{- end }} diff --git a/k8s/helm/templates/networking/nccl-topology-configmap.yaml b/k8s/helm/templates/networking/nccl-topology-configmap.yaml new file mode 
100644 index 0000000000..37045febc1 --- /dev/null +++ b/k8s/helm/templates/networking/nccl-topology-configmap.yaml @@ -0,0 +1,78 @@ +{{- if .Values.multinodeNetworking.azure.enabled }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "nemo-platform.fullname" . }}-nccl-topo + namespace: {{ .Release.Namespace }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +data: + nccl-topo.xml: | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +{{- end }} diff --git a/k8s/helm/templates/ngc-api-secret.yaml b/k8s/helm/templates/ngc-api-secret.yaml new file mode 100644 index 0000000000..e432c5d6a3 --- /dev/null +++ b/k8s/helm/templates/ngc-api-secret.yaml @@ -0,0 +1,12 @@ +{{- if not .Values.existingSecret }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: ngc-api + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +type: Opaque +data: + NGC_API_KEY: {{ .Values.ngcAPIKey | b64enc }} +{{- end }} diff --git a/k8s/helm/templates/openshift-route.yaml b/k8s/helm/templates/openshift-route.yaml new file mode 100644 index 0000000000..6ca8445152 --- /dev/null +++ b/k8s/helm/templates/openshift-route.yaml @@ -0,0 +1,28 @@ +{{- if .Values.openshiftRoute.enabled -}} +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: {{ include "nemo-platform.name" . }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.openshiftRoute.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.openshiftRoute.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- if .Values.openshiftRoute.host }} + host: {{ .Values.openshiftRoute.host | quote }} + {{- end }} + port: + targetPort: {{ tpl (toString .Values.openshiftRoute.targetPort) $ | int }}{{- /* toString: tpl only accepts a string, but values may supply the port as a number */}} + to: + kind: Service + name: {{ tpl .Values.openshiftRoute.service $ }} + {{- if .Values.openshiftRoute.tls }} + tls: + {{- toYaml .Values.openshiftRoute.tls | nindent 4 }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/platform-configmap.yaml b/k8s/helm/templates/platform-configmap.yaml new file mode 100644 index 0000000000..4ab9a48a67 --- /dev/null +++ b/k8s/helm/templates/platform-configmap.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "nemo-platform.platform-configmap" . }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +data: + config.yaml: | + {{- include "nemo-platform.calculatedConfig" . | nindent 4 }}{{- /* leading "-" drops the whitespace-only first line; nindent already supplies the newline and indent */}} diff --git a/k8s/helm/templates/platform-seed-job.yaml b/k8s/helm/templates/platform-seed-job.yaml new file mode 100644 index 0000000000..e8505c8b46 --- /dev/null +++ b/k8s/helm/templates/platform-seed-job.yaml @@ -0,0 +1,81 @@ +{{- if and .Values.api.enabled .Values.platformSeedJob.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "nemo-platform.fullname" . }}-platform-seed + labels: + app.kubernetes.io/component: platform-seed + {{- include "nemo-platform.labels" . | nindent 4 }} + annotations: + # Run after install and upgrade so seeding runs on fresh installs and can be re-triggered on no-op upgrade. + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "5" + # Delete previous Job before creating the new one on upgrade (allows re-trigger on no-op upgrade).
+ helm.sh/hook-delete-policy: before-hook-creation +spec: + ttlSecondsAfterFinished: {{ .Values.platformSeedJob.ttlSecondsAfterFinished }} + backoffLimit: {{ .Values.platformSeedJob.backoffLimit }} + activeDeadlineSeconds: {{ .Values.platformSeedJob.activeDeadlineSeconds }} + template: + metadata: + labels: + app.kubernetes.io/component: platform-seed + {{- include "nemo-platform.selectorLabels" . | nindent 8 }} + {{- with .Values.platformSeedJob.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + restartPolicy: OnFailure + imagePullSecrets: + {{ include "nemo-common.imagepullsecrets" . | nindent 8 | trim }} + serviceAccountName: {{ include "nmp-api.apiServiceAccountName" . }} + securityContext: + {{- include "nemo-common.podSecurityContext" (dict "global" .Values.podSecurityContext "local" .Values.platformSeedJob.podSecurityContext) | nindent 8 }} + containers: + - name: platform-seed + securityContext: + {{- include "nemo-common.securityContext" (dict "global" .Values.securityContext "local" .Values.platformSeedJob.securityContext) | nindent 12 }} + image: {{ include "nmp-api.image" . | quote }} + imagePullPolicy: {{ .Values.api.image.pullPolicy }} + command: + - nemo-platform + args: + - run + - task + - --task + - nmp.platform_seed + envFrom: + - secretRef: + name: {{ .Values.envFromSecret | default (printf "%s-api-env" (include "nemo-platform.fullname" .)) }} + env: + - name: NMP_CONFIG_FILE_PATH + value: /etc/nmp/config.yaml + - name: OTEL_SERVICE_NAME + value: {{ include "nemo-platform.fullname" . }}-platform-seed + {{ include "nemo-platform.env" . | nindent 12 | trim }} + {{- with .Values.platformSeedJob.extraEnv }} + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + - name: config + mountPath: /etc/nmp/config.yaml + subPath: config.yaml + resources: + {{- toYaml .Values.platformSeedJob.resources | nindent 12 }} + volumes: + - name: config + configMap: + name: {{ include "nemo-platform.platform-configmap" . 
}} + {{- with .Values.platformSeedJob.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.platformSeedJob.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.platformSeedJob.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/postgres/postgres-secret.yaml b/k8s/helm/templates/postgres/postgres-secret.yaml new file mode 100644 index 0000000000..f3ac76c12c --- /dev/null +++ b/k8s/helm/templates/postgres/postgres-secret.yaml @@ -0,0 +1,12 @@ +{{- if and .Values.postgresql.enabled (not .Values.postgresql.auth.existingSecret) }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "nemo-common.postgresql.fullname" . }} + labels: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.labels" . | nindent 4 }} +type: Opaque +stringData: + password: {{ .Values.postgresql.auth.password | quote }} +{{- end }} diff --git a/k8s/helm/templates/postgres/postgres-service.yaml b/k8s/helm/templates/postgres/postgres-service.yaml new file mode 100644 index 0000000000..3e112dcc9d --- /dev/null +++ b/k8s/helm/templates/postgres/postgres-service.yaml @@ -0,0 +1,19 @@ +{{- if .Values.postgresql.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "nemo-common.postgresql.fullname" . }} + labels: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.labels" . | nindent 4 }} +spec: + type: ClusterIP + ports: + - name: postgresql + port: {{ .Values.postgresql.service.port | int }} + targetPort: 5432 + protocol: TCP + selector: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.selectorLabels" . 
| nindent 4 }} +{{- end }} diff --git a/k8s/helm/templates/postgres/postgres-serviceaccount.yaml b/k8s/helm/templates/postgres/postgres-serviceaccount.yaml new file mode 100644 index 0000000000..aa13997bac --- /dev/null +++ b/k8s/helm/templates/postgres/postgres-serviceaccount.yaml @@ -0,0 +1,14 @@ +{{- if and .Values.postgresql.enabled .Values.postgresql.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nemo-common.postgresql.serviceAccountName" . }} + labels: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.labels" . | nindent 4 }} + {{- with .Values.postgresql.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.postgresql.serviceAccount.automount }} +{{- end }} diff --git a/k8s/helm/templates/postgres/postgres-statefulset.yaml b/k8s/helm/templates/postgres/postgres-statefulset.yaml new file mode 100644 index 0000000000..f53b515e30 --- /dev/null +++ b/k8s/helm/templates/postgres/postgres-statefulset.yaml @@ -0,0 +1,122 @@ +{{- if .Values.postgresql.enabled }} +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "nemo-common.postgresql.fullname" . }} + labels: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.labels" . | nindent 4 }} +spec: + serviceName: {{ include "nemo-common.postgresql.fullname" . }} + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.labels" . | nindent 8 }} + spec: + serviceAccountName: {{ include "nemo-common.postgresql.serviceAccountName" . 
}} + securityContext: + {{- include "nemo-common.podSecurityContext" (dict "global" .Values.podSecurityContext "local" .Values.postgresql.podSecurityContext) | nindent 8 }} + {{- with .Values.postgresql.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.postgresql.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.postgresql.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: postgres + image: {{ .Values.postgresql.image.repository }}:{{ .Values.postgresql.image.tag }} + imagePullPolicy: {{ .Values.postgresql.image.pullPolicy }} + securityContext: + {{- include "nemo-common.securityContext" (dict "global" .Values.securityContext "local" .Values.postgresql.securityContext) | nindent 12 }} + env: + - name: POSTGRES_USER + value: {{ include "nemo-common.postgresql.user" . }} + - name: POSTGRES_DB + value: {{ include "nemo-common.postgresql.name" . }} + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "nemo-common.postgresql.secret-name" . }} + key: {{ include "nemo-common.postgresql.password-key" . }} + ports: + - name: postgresql + containerPort: 5432 + protocol: TCP + livenessProbe: + exec: + command: + - pg_isready + - -U + - {{ include "nemo-common.postgresql.user" . }} + - -d + - {{ include "nemo-common.postgresql.name" . }} + - -p + - "5432" + - -h + - 127.0.0.1 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + exec: + command: + - pg_isready + - -U + - {{ include "nemo-common.postgresql.user" . }} + - -d + - {{ include "nemo-common.postgresql.name" . 
}} + - -p + - "5432" + - -h + - 127.0.0.1 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + {{- with .Values.postgresql.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + {{- if not .Values.postgresql.persistence.enabled }} + - name: data + emptyDir: {} + {{- end }} + {{- if .Values.postgresql.persistence.enabled }} + volumeClaimTemplates: + - metadata: + name: data + labels: + app.kubernetes.io/component: postgres + {{- include "nemo-platform.selectorLabels" . | nindent 10 }}{{- /* selector labels only: volumeClaimTemplates is immutable after creation, so chart/version labels (assumed to be part of nemo-platform.labels - confirm) would make upgrades fail */}} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.postgresql.persistence.size | quote }} + {{- with .Values.postgresql.persistence.storageClass }} + storageClassName: {{ . | quote }} + {{- end }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/proxy/_helpers.tpl b/k8s/helm/templates/proxy/_helpers.tpl new file mode 100644 index 0000000000..ec092087f0 --- /dev/null +++ b/k8s/helm/templates/proxy/_helpers.tpl @@ -0,0 +1,25 @@ +{{/* +Create a named Envoy service name which can be included from parent chart +*/}} +{{- define "nmp-envoy.servicename" }} +{{- printf "%s-envoy" ( include "nemo-platform.fullname" . | trunc 57 ) }} +{{- end }} + +{{/* +Labels for Envoy proxy resources (component + platform labels). +*/}} +{{- define "nmp-envoy.labels" -}} +app.kubernetes.io/component: nmp-envoy +{{ include "nemo-platform.labels" .
}} +{{- end }} + +{{/* +Create the name of the Envoy service account to use +*/}} +{{- define "nmp-envoy.serviceAccountName" -}} +{{- if .Values.envoyProxy.serviceAccount.create }} +{{- default (printf "%s-envoy" (include "nemo-platform.fullname" .)) .Values.envoyProxy.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.envoyProxy.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/proxy/envoy-configmap.yaml b/k8s/helm/templates/proxy/envoy-configmap.yaml new file mode 100644 index 0000000000..575bee1016 --- /dev/null +++ b/k8s/helm/templates/proxy/envoy-configmap.yaml @@ -0,0 +1,68 @@ +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "nmp-envoy.servicename" . }} + labels: + {{- include "nmp-envoy.labels" . | nindent 4 }} +data: + envoy.yaml: | + admin: + address: + socket_address: + address: 0.0.0.0 + port_value: {{ .Values.envoyProxy.adminPort }} + static_resources: + listeners: + - name: listener_0 + address: + socket_address: + address: 0.0.0.0 + port_value: {{ .Values.envoyProxy.service.port }} + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: ingress_http + stream_idle_timeout: {{ .Values.envoyProxy.timeouts.streamIdle | quote }} + request_headers_timeout: {{ .Values.envoyProxy.timeouts.requestHeaders | quote }} + request_timeout: {{ .Values.envoyProxy.timeouts.request | quote }} + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + name: passthrough_route + internal_only_headers: + {{- 
range .Values.envoyProxy.trustedHeaders }} + - {{ . }} + {{- end }} + virtual_hosts: + - name: backend + domains: ["*"] + routes: + - match: + prefix: "/" + route: + cluster: backend_cluster + timeout: {{ .Values.envoyProxy.timeouts.route | quote }} + clusters: + - name: backend_cluster + type: STRICT_DNS + lb_policy: ROUND_ROBIN + connect_timeout: {{ .Values.envoyProxy.timeouts.connect | quote }} + load_assignment: + cluster_name: backend_cluster + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: {{ include "nmp-api.api-servicename" . }} + port_value: {{ .Values.api.service.port }} +{{- end }} diff --git a/k8s/helm/templates/proxy/envoy-deployment.yaml b/k8s/helm/templates/proxy/envoy-deployment.yaml new file mode 100644 index 0000000000..20b8292187 --- /dev/null +++ b/k8s/helm/templates/proxy/envoy-deployment.yaml @@ -0,0 +1,91 @@ +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "nmp-envoy.servicename" . }} + labels: + {{- include "nmp-envoy.labels" . | nindent 4 }} + annotations: + {{- with .Values.envoyProxy.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if not .Values.envoyProxy.autoscaling.enabled }} + replicas: {{ .Values.envoyProxy.replicaCount }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/component: nmp-envoy + {{- include "nemo-platform.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + checksum/config: {{ include (print $.Template.BasePath "/proxy/envoy-configmap.yaml") . | sha256sum }} + {{- with .Values.envoyProxy.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "nmp-envoy.labels" . | nindent 8 }} + {{- with .Values.envoyProxy.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "nmp-envoy.serviceAccountName" . 
}} + securityContext: + {{- include "nemo-common.podSecurityContext" (dict "global" .Values.podSecurityContext "local" .Values.envoyProxy.podSecurityContext) | nindent 8 }} + imagePullSecrets: + {{ include "nemo-common.imagepullsecrets" . | nindent 8 | trim }} + containers: + - name: envoy + securityContext: + {{- include "nemo-common.securityContext" (dict "global" .Values.securityContext "local" .Values.envoyProxy.securityContext) | nindent 12 }} + image: "{{ .Values.envoyProxy.image.repository }}:{{ .Values.envoyProxy.image.tag }}" + imagePullPolicy: {{ .Values.envoyProxy.image.pullPolicy }} + args: + - -c + - /etc/envoy/envoy.yaml + - -l + - info + {{- range .Values.envoyProxy.extraArgs }} + - {{ . | quote }} + {{- end }} + env: + {{ include "nemo-platform.envoyProxy.env" . | nindent 12 | trim }} + ports: + - name: proxy + containerPort: {{ .Values.envoyProxy.service.port }} + - name: admin + containerPort: {{ .Values.envoyProxy.adminPort }} + volumeMounts: + - name: config + mountPath: /etc/envoy + readOnly: true + startupProbe: + {{- toYaml .Values.envoyProxy.startupProbe | nindent 12 }} + livenessProbe: + {{- toYaml .Values.envoyProxy.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.envoyProxy.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.envoyProxy.resources | nindent 12 }} + volumes: + - name: config + configMap: + name: {{ include "nmp-envoy.servicename" . }} + {{- with .Values.envoyProxy.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.envoyProxy.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.envoyProxy.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.envoyProxy.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/k8s/helm/templates/proxy/envoy-hpa.yaml b/k8s/helm/templates/proxy/envoy-hpa.yaml new file mode 100644 index 0000000000..6eeb066036 --- /dev/null +++ b/k8s/helm/templates/proxy/envoy-hpa.yaml @@ -0,0 +1,38 @@ +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled }} +{{- if .Values.envoyProxy.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "nmp-envoy.servicename" . }} + labels: + {{- include "nmp-envoy.labels" . | nindent 4 }} + {{- with .Values.envoyProxy.autoscaling.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "nmp-envoy.servicename" . }} + minReplicas: {{ .Values.envoyProxy.autoscaling.minReplicas }} + maxReplicas: {{ .Values.envoyProxy.autoscaling.maxReplicas }} + metrics: + {{- if .Values.envoyProxy.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.envoyProxy.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.envoyProxy.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.envoyProxy.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/proxy/envoy-service.yaml b/k8s/helm/templates/proxy/envoy-service.yaml new file mode 100644 index 0000000000..1a3526d9cb --- /dev/null +++ b/k8s/helm/templates/proxy/envoy-service.yaml @@ -0,0 +1,26 @@ +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "nmp-envoy.servicename" . }} + labels: + {{- include "nmp-envoy.labels" . 
| nindent 4 }} + {{- with .Values.envoyProxy.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.envoyProxy.service.type }} + ports: + - name: proxy + port: {{ .Values.envoyProxy.service.port }} + targetPort: proxy + protocol: TCP + - name: admin + port: {{ .Values.envoyProxy.adminPort }} + targetPort: admin + protocol: TCP + selector: + app.kubernetes.io/component: nmp-envoy + {{- include "nemo-platform.selectorLabels" . | nindent 4 }} +{{- end }} diff --git a/k8s/helm/templates/proxy/envoy-serviceaccount.yaml b/k8s/helm/templates/proxy/envoy-serviceaccount.yaml new file mode 100644 index 0000000000..0671fde5c7 --- /dev/null +++ b/k8s/helm/templates/proxy/envoy-serviceaccount.yaml @@ -0,0 +1,15 @@ +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled }} +{{- if .Values.envoyProxy.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nmp-envoy.serviceAccountName" . }} + labels: + {{- include "nmp-envoy.labels" . | nindent 4 }} + {{- with .Values.envoyProxy.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.envoyProxy.serviceAccount.automount }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/proxy/envoy-servicemonitor.yaml b/k8s/helm/templates/proxy/envoy-servicemonitor.yaml new file mode 100644 index 0000000000..01523023fd --- /dev/null +++ b/k8s/helm/templates/proxy/envoy-servicemonitor.yaml @@ -0,0 +1,38 @@ +{{- if and (include "nemo-platform.authEnabled" .) .Values.envoyProxy.enabled }} +{{- if .Values.envoyProxy.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "nmp-envoy.servicename" . }} + labels: + {{- include "nmp-envoy.labels" . | nindent 4 }} + {{- with .Values.envoyProxy.serviceMonitor.labels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} + annotations: + {{- with .Values.envoyProxy.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.envoyProxy.serviceMonitor.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + app.kubernetes.io/component: nmp-envoy + {{- include "nemo-platform.selectorLabels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + # Envoy admin /stats/prometheus endpoint for Prometheus metrics + - port: admin + path: /stats/prometheus + {{- with .Values.envoyProxy.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.envoyProxy.serviceMonitor.scheme }} + scheme: {{ . }} + {{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/templates/tests/nccl-test.yaml b/k8s/helm/templates/tests/nccl-test.yaml new file mode 100644 index 0000000000..7a7e257f12 --- /dev/null +++ b/k8s/helm/templates/tests/nccl-test.yaml @@ -0,0 +1,314 @@ +{{- /* Chart test (helm test only): resources use helm.sh/hook: test and are not applied on install/upgrade. See https://helm.sh/docs/topics/chart_tests/ */ -}} +{{- /* kyverno-policy.yaml: only one multinodeNetworking..enabled; $cloud is the active profile. */ -}} +{{- $cloud := "" }} +{{- if .Values.multinodeNetworking.aws.enabled }}{{- $cloud = "aws" }} +{{- else if .Values.multinodeNetworking.azure.enabled }}{{- $cloud = "azure" }} +{{- else if .Values.multinodeNetworking.gcp.enabled }}{{- $cloud = "gcp" }} +{{- else if .Values.multinodeNetworking.oci.enabled }}{{- $cloud = "oci" }} +{{- end }} +{{- $ncclTestKyvernoAlign := ne $cloud "" }} +{{- $v := .Values.ncclTest.validation | default dict }} +{{- /* Per-cloud derived resources and checks (see multicloud-nccl-test README + kyverno mutate rules). 
*/ -}} +{{- $allocatableResource := "rdma/rdma_shared_device" }} +{{- if eq $cloud "aws" }}{{- $allocatableResource = "vpc.amazonaws.com/efa" }}{{- end }} +{{- if eq $cloud "azure" }}{{- $allocatableResource = printf "rdma/%s" .Values.multinodeNetworking.azure.rdmaDeviceName }}{{- end }} +{{- if eq $cloud "oci" }}{{- $allocatableResource = "nvidia.com/mlnxnics" }}{{- end }} +{{- if eq $cloud "gcp" }}{{- $allocatableResource = "" }}{{- end }} + +{{- /* expectIb: azure+oci target mlx5 / NET/IB in NCCL logs; aws=EFA/socket, gcp=TCP-X (not NET/IB). strictIb: PORT_ACTIVE only where IB is mandatory in-network-operator docs (Azure); oci/aws/gcp skip in-pod ibv strict check. */ -}} +{{- $expectIb := or (eq $cloud "azure") (eq $cloud "oci") }} +{{- $strictIb := eq $cloud "azure" }} +{{- $requireRdmaAlloc := and $ncclTestKyvernoAlign (ne $cloud "gcp") }} +{{- $gpuKey := .Values.ncclTest.gpuResourceKey | default "nvidia.com/gpu" }} +{{- $gpuN := .Values.ncclTest.gpusPerNode | default 1 | toString | int }} +{{- $gpuReq := .Values.ncclTest.gpusPerNode | default 1 | toString }} +{{- $workerNetName := "" }} +{{- $workerNetQty := "0" }} +{{- if eq $cloud "aws" }} +{{- $workerNetName = "vpc.amazonaws.com/efa" }} +{{- $workerNetQty = mul $gpuN (int .Values.multinodeNetworking.aws.efaDevicesPerGPU) | toString }} +{{- else if eq $cloud "azure" }} +{{- $workerNetName = printf "rdma/%s" .Values.multinodeNetworking.azure.rdmaDeviceName }} +{{- $workerNetQty = mul $gpuN (int .Values.multinodeNetworking.azure.rdmaDevicesPerGPU) | toString }} +{{- else if eq $cloud "oci" }} +{{- $workerNetName = "nvidia.com/mlnxnics" }} +{{- $workerNetQty = mul $gpuN (int .Values.multinodeNetworking.oci.rdmaDevicesPerGPU) | toString }} +{{- end }} +{{- $fullname := include "nemo-platform.fullname" . 
}} +{{- $ncclTestBase := printf "%s-nccl-test" ($fullname | trunc 40 | trimSuffix "-") | trunc 42 }} +{{- $hookDel := "before-hook-creation,hook-succeeded,hook-failed" }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ printf "%s-sa" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "-20" + helm.sh/hook-delete-policy: {{ $hookDel | quote }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ printf "%s-role" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "-20" + helm.sh/hook-delete-policy: {{ $hookDel | quote }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["create", "get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - {{ printf "%s-cm" $ncclTestBase | trunc 63 }} + verbs: ["get", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ printf "%s-rb" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "-20" + helm.sh/hook-delete-policy: {{ $hookDel | quote }} + labels: + {{- include "nemo-platform.labels" . 
| nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ printf "%s-role" $ncclTestBase | trunc 63 }} +subjects: + - kind: ServiceAccount + name: {{ printf "%s-sa" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ printf "%s-cr" $ncclTestBase | trunc 63 }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "-20" + helm.sh/hook-delete-policy: {{ $hookDel | quote }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ printf "%s-crb" $ncclTestBase | trunc 63 }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "-20" + helm.sh/hook-delete-policy: {{ $hookDel | quote }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ printf "%s-cr" $ncclTestBase | trunc 63 }} +subjects: + - kind: ServiceAccount + name: {{ printf "%s-sa" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-cm" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "-25" + helm.sh/hook-delete-policy: {{ $hookDel | quote }} + labels: + {{- include "nemo-platform.labels" . 
| nindent 4 }} +data: + nccl_test.py: | +{{- /* leading "-" removes the newline after "|" so each embedded file starts on its first line (keeps shebangs on line 1) */}}{{- .Files.Get "files/nccl-test/nccl_test.py" | nindent 4 }} + entrypoint.sh: | +{{- .Files.Get "files/nccl-test/entrypoint.sh" | nindent 4 }} + orchestrator.py: | +{{- .Files.Get "files/nccl-test/orchestrator.py" | nindent 4 }} + nccl-env.sh: | +{{- .Files.Get "files/nccl-test/nccl-env.sh" | nindent 4 }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ printf "%s-job" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "0" + helm.sh/hook-delete-policy: "before-hook-creation,hook-succeeded" + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 600 + template: + metadata: + labels: + {{- include "nemo-platform.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: nccl-test-orchestrator + spec: + restartPolicy: Never + terminationGracePeriodSeconds: 0 + serviceAccountName: {{ printf "%s-sa" $ncclTestBase | trunc 63 }} + imagePullSecrets: + {{ include "nemo-common.imagepullsecrets" .
| nindent 8 | trim }} + containers: + - name: orchestrator + image: "{{ .Values.ncclTest.orchestrator.image.repository }}:{{ .Values.ncclTest.orchestrator.image.tag }}" + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-lc"] + args: + - | + set -e + pip install --no-cache-dir 'kubernetes>=28.0.0' >/dev/null + python -u /scripts/orchestrator.py + env: + - name: PYTHONUNBUFFERED + value: "1" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: HELM_HOOK + value: test + - name: HELM_HOOK_DELETE_POLICY + value: {{ $hookDel | quote }} + - name: NCCL_TEST_KYVERNO_ENABLE_MULTI_NODE + value: {{ ternary "true" "false" $ncclTestKyvernoAlign | quote }} + - name: NCCL_TEST_ALLOW_PLATFORM_INJECTION + value: {{ ternary "true" "false" $ncclTestKyvernoAlign | quote }} + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: TEST_FULLNAME + value: {{ $ncclTestBase | quote }} + - name: SCRIPTS_CONFIGMAP_NAME + value: {{ printf "%s-cm" $ncclTestBase | trunc 63 | quote }} + - name: RELEASE_NAME + value: {{ .Release.Name | quote }} + - name: GPU_NODE_LABEL_KEY + value: {{ .Values.ncclTest.gpuNodeLabelKey | quote }} + - name: GPU_NODE_LABEL_VALUE + value: {{ .Values.ncclTest.gpuNodeLabelValue | quote }} + - name: WORKER_IMAGE + value: "{{ .Values.ncclTest.worker.image.repository }}:{{ default .Chart.AppVersion .Values.ncclTest.worker.image.tag }}" + - name: IMAGE_PULL_SECRET + value: {{ include "nemo-common.imagePullSecretName" . 
| quote }} + - name: MASTER_PORT + value: "29500" + - name: WAIT_TIMEOUT_SECONDS + value: {{ .Values.ncclTest.waitTimeoutSeconds | quote }} + - name: NCCL_TEST_ITERATIONS + value: {{ .Values.ncclTest.iterations | default 10 | quote }} + - name: WORKER_GPU_RESOURCE_KEY + value: {{ $gpuKey | quote }} + - name: WORKER_GPU_REQUEST + value: {{ $gpuReq | quote }} + - name: WORKER_CPU_REQUEST + value: {{ .Values.ncclTest.worker.resources.requests.cpu | quote }} + - name: WORKER_CPU_LIMIT + value: {{ .Values.ncclTest.worker.resources.limits.cpu | quote }} + - name: WORKER_MEMORY_REQUEST + value: {{ .Values.ncclTest.worker.resources.requests.memory | quote }} + - name: WORKER_MEMORY_LIMIT + value: {{ .Values.ncclTest.worker.resources.limits.memory | quote }} + - name: NCCL_TEST_MIN_BANDWIDTH_MBPS + value: {{ $v.minBandwidthMBpsAt1024MB | default 8000 | quote }} + - name: NCCL_TEST_EXPECT_IB_TRANSPORT + value: {{ ternary "true" "false" $expectIb | quote }} + - name: NCCL_TEST_STRICT_IB_PORT_ACTIVE + value: {{ ternary "true" "false" $strictIb | quote }} + - name: REQUIRE_RDMA_ALLOCATABLE + value: {{ ternary "true" "false" $requireRdmaAlloc | quote }} + - name: RDMA_ALLOCATABLE_RESOURCE + value: {{ $allocatableResource | quote }} + - name: WORKER_INTERCONNECT_RESOURCE_ENABLED + value: {{ ternary "true" "false" (ne $workerNetName "") | quote }} + - name: WORKER_INTERCONNECT_RESOURCE_NAME + value: {{ $workerNetName | quote }} + - name: WORKER_INTERCONNECT_RESOURCE_REQUEST + value: {{ $workerNetQty | quote }} + - name: WORKER_INTERCONNECT_RESOURCE_LIMIT + value: {{ $workerNetQty | quote }} + volumeMounts: + - name: scripts + mountPath: /scripts + {{- with .Values.ncclTest.orchestrator.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: scripts + configMap: + name: {{ printf "%s-cm" $ncclTestBase | trunc 63 }} + defaultMode: 420 +--- +# Runs after the orchestrator Job (hook-weight 0). 
Deletes the scripts ConfigMap so it is not left behind if Helm hook delete races. +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ printf "%s-clean-cm" $ncclTestBase | trunc 63 }} + namespace: {{ .Release.Namespace }} + annotations: + helm.sh/hook: test + helm.sh/hook-weight: "5"  # higher weight than the orchestrator Job ("0"), so Helm runs this hook after it + helm.sh/hook-delete-policy: {{ $hookDel | quote }} + labels: + {{- include "nemo-platform.labels" . | nindent 4 }} +spec: + backoffLimit: 0  # no retries for the cleanup pod + ttlSecondsAfterFinished: 600  # finished Job is eligible for TTL cleanup after 10 minutes + template: + metadata: + labels: + {{- include "nemo-platform.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: nccl-test-cleanup-configmap + spec: + restartPolicy: Never + serviceAccountName: {{ printf "%s-sa" $ncclTestBase | trunc 63 }} + imagePullSecrets: + {{ include "nemo-common.imagepullsecrets" . | nindent 8 | trim }} + containers: + - name: kubectl + image: "{{ .Values.ncclTest.configMapCleanupJob.image.repository }}:{{ .Values.ncclTest.configMapCleanupJob.image.tag }}" + imagePullPolicy: IfNotPresent + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CM_NAME + value: {{ printf "%s-cm" $ncclTestBase | trunc 63 | quote }} + command: ["/bin/bash", "-lc"] + args: + - | + set -e + echo "Deleting ConfigMap ${CM_NAME} in ${NAMESPACE}" + kubectl delete configmap "${CM_NAME}" -n "${NAMESPACE}" --ignore-not-found + echo "ConfigMap cleanup done." diff --git a/k8s/helm/values.yaml b/k8s/helm/values.yaml new file mode 100644 index 0000000000..5683f7eec0 --- /dev/null +++ b/k8s/helm/values.yaml @@ -0,0 +1,956 @@ +# Default values for NeMo Microservices Platform Helm chart + +## Helm global configuration settings + +# -- Overrides for name and fullname templates +nameOverride: "" +fullnameOverride: "" + +# -- Your NVIDIA GPU Cloud (NGC) API key authenticates and enables pulling images from the NGC container registry. The existing secret overrides this key if you provide one to the `existingSecret` key. 
+ngcAPIKey: YOUR-NGC-API-KEY + +# -- Environment variables that will be applied to every deployment pod. Uses a simple key value map structure like MY_ENV_VAR: the-key and works with valueFrom as well. +env: {} + +# -- Optional. Name of an existing Kubernetes Secret to load as env vars (envFrom) for the API pod. +# When set, the chart does not create the default api-env secret; use your own secret (e.g. from Vault, sealed-secrets). +# When unset, the chart creates a default secret with the environment variable NMP_SECRETS_DEFAULT_ENCRYPTION_KEY for default installation. +# See the NeMo Platform documentation for more details on secrets encryption. +envFromSecret: "" + +# -- You can use an existing Kubernetes secret for communicating with the NGC API for downloading models. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string. +existingSecret: ngc-api + +# -- You can specify an existing Kubernetes image pull secret for pulling images from the NGC container registry. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string. +existingImagePullSecret: nvcrimagepullsecret + +# -- List of additional image pull secrets to use for pulling container images. Can be used when multiple image pull secrets are required in your environment. +additionalImagePullSecrets: [] + +# -- RBAC configuration settings for optional dependencies +rbac: + # -- Specifies whether to enable the core Controller to have RBAC permissions to Volcano for scheduling distributed jobs. + volcanoEnabled: true + # -- Specifies whether to enable the core Controller to have RBAC permissions to k8s-nim-operator's NIMService for scheduling NIMs. + k8sNimOperatorEnabled: true + +# -- Multi-node networking configuration for distributed GPU training. +# These settings control Kyverno policies that inject cloud-specific networking and NCCL configurations. 
+# +# Requirements: +# - Kyverno policy engine must be installed in your cluster (required for multi-node networking) +# - Kyverno is NOT included as a subchart dependency and must be installed separately +# +# To install Kyverno: +# helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace --version 3.2.0 +# +# Documentation: https://kyverno.io/docs/installation/ +# Helm chart: https://kyverno.github.io/kyverno/ +# +# Note: Only enable ONE cloud provider per cluster deployment. +multinodeNetworking: + # -- AWS-specific configuration for EFA device injection + aws: + # -- Enable AWS-specific Kyverno policy for EFA device injection + enabled: false + # -- Number of EFA devices to request per GPU (typically 1 or 4) + efaDevicesPerGPU: 1 + + # -- Azure-specific configuration for InfiniBand/RDMA + azure: + # -- Enable Azure-specific Kyverno policy for InfiniBand/RDMA configuration + enabled: false + # -- Number of RDMA devices to request per GPU + rdmaDevicesPerGPU: 1 + # -- RDMA device plugin resource name + rdmaDeviceName: "hca_shared_devices_a" + + # -- GCP-specific configuration for TCP-X/TCP-XO + gcp: + # -- Enable GCP-specific Kyverno policy for TCP-X/TCP-XO configuration + enabled: false + + # -- OCI-specific configuration for InfiniBand/SR-IOV + oci: + # -- Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration + enabled: false + # -- Number of RDMA devices (mlnxnics) to request per GPU + rdmaDevicesPerGPU: 8 + +# -- NCCL chart test (`helm test`): multi-node allreduce check. Templates use helm.sh/hook: test — they are not created on install/upgrade, only when you run helm test. +# Requires nodes labeled with gpuNodeLabelKey/gpuNodeLabelValue (default NFD / GPU operator style). +# See https://helm.sh/docs/topics/chart_tests/ +ncclTest: + # -- Node label used to discover GPU workers (must match your cluster). 
+ gpuNodeLabelKey: nvidia.com/gpu.present + gpuNodeLabelValue: "true" + # -- Resource name for GPU capacity on worker pods (e.g. nvidia.com/gpu or a MIG device). + gpuResourceKey: nvidia.com/gpu + # -- GPUs per worker pod / per node (torch.distributed nproc_per_node). + # IMPORTANT: Set this value before testing + gpusPerNode: 1 + # -- Max seconds to wait for each worker pod to complete. + waitTimeoutSeconds: 900 + # -- How many times to run the full multinode NCCL test (orchestrator loop; env NCCL_TEST_ITERATIONS). + # Increase the test timeout on helm test if increasing this variable + iterations: 3 + validation: + # -- Minimum allreduce bandwidth (MB/s) at 1024MB message size; 0 disables the floor check in nccl_test.py. + minBandwidthMBpsAt1024MB: 8000 + orchestrator: + image: + repository: docker.io/library/python + tag: "3.12-slim" + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 512Mi + # -- Post-test hook Job (after orchestrator): kubectl deletes the scripts ConfigMap (helm.sh/hook-weight 5). + configMapCleanupJob: + image: + repository: bitnami/kubectl + tag: "latest" + worker: + image: + repository: nvcr.io/nvidia/nemo/nmp-automodel-training + tag: "" + resources: + requests: + cpu: "4" + memory: 8Gi + limits: + cpu: "8" + memory: 16Gi + +## Optional dependencies configuration. For production deployments, it is recommended to use existing installations of these dependencies. +k8s-nim-operator: + # -- Specifies whether to enable the default NIM Operator installation. To learn more, see [Install NIM Operator](https://docs.nvidia.com/nim-operator/latest/install.html). + # If you are using an existing NIM Operator installation, set this to false. + enabled: true + nfd: + nodeFeatureRules: + # -- Specifies whether to enable device ID feature rules. + deviceID: false + +# -- Local PostgreSQL configuration for the NeMo Platform. +# @default -- This object has the following default values for the PostgreSQL configuration. 
+postgresql: + # -- Whether to deploy the embedded PostgreSQL. If enabled, the chart deploys a single-replica PostgreSQL instance using the official Postgres image. + # It is NOT recommended to use the built-in PostgreSQL for production deployments. It is enabled in the chart by default for ease of getting started with the platform. + # If you are using an existing PostgreSQL installation, set this to false and use the "externalDatabase" configuration section. + enabled: true + image: + repository: docker.io/library/postgres + tag: "18" + pullPolicy: IfNotPresent + # -- PostgreSQL authentication configuration. + auth: + username: nemo + password: nemo + database: nemoplatform + # -- Name of an existing secret containing a "password" key (or use existingSecretPasswordKey). If set, the chart does not create a secret. + existingSecret: "" + + + # -- PostgreSQL service configuration. + service: + port: 5432 + + # -- PostgreSQL persistence configuration. + persistence: + enabled: true + size: 5Gi + # -- Storage class for the PostgreSQL PVC. If unset, the cluster default is used. + storageClass: "" + # -- Optional resource limits/requests for the PostgreSQL container. + resources: {} + # -- Optional pod security context for the PostgreSQL pod (e.g. for OpenShift SCC). + podSecurityContext: {} + # -- Optional container security context for the PostgreSQL container. + securityContext: {} + # -- Service account for the PostgreSQL pod. + # @default -- This object has the following default values for the service account configuration. + serviceAccount: + # -- Specifies whether a service account should be created for the PostgreSQL pod. + create: true + # -- Automatically mount the ServiceAccount's API credentials. + automount: true + # -- Annotations to add to the service account. + annotations: {} + # -- The name of the service account to use. If not set and create is true, a name is generated from the release fullname. + name: "" + # -- Node selector for the PostgreSQL pod. 
+ nodeSelector: {} + # -- Affinity for the PostgreSQL pod. + affinity: {} + # -- Tolerations for the PostgreSQL pod. + tolerations: [] + +# -- External PostgreSQL configuration settings. These values are only used when postgresql.enabled is set to false. +# @default -- This object has the following default values for the external PostgreSQL configuration. +externalDatabase: + # -- External database host address. + host: localhost + # -- External database port number. + port: 5432 + # -- Database username + user: nemo + # -- Database name. + database: nemoplatform + # -- Name of an existing secret resource containing the database credentials. + existingSecret: "" + # -- Name of an existing secret key containing the database credentials. + existingSecretPasswordKey: "" + # -- URI secret configuration for external database. + # @default -- This object has the following default values for the URI secret configuration. + uriSecret: + # -- Name of the URI secret. + name: "" + # -- Key in the URI secret containing the database URI. + key: "" + +# -- Platform-wide configuration settings +# Set configuration here to apply custom, structured configuration across all services. +# Applied after the base platform config is evaluated for templates. Enables adding / overriding YAML-based elements in the evaluated platform config. +# It is usually recommended to use this config section instead of `basePlatformConfig` unless you need to use templating features. +# For example, you can set the NIM default StorageClass via models.controller.backends.k8s-nim-operator.config.default_storage_class. +# For full configuration reference, see the NeMo Platform's config reference: +# https://docs.nvidia.com/nemo/microservices/latest/set-up/config-reference.html +platformConfig: {} + +# -- Base platform configuration settings +# @default -- This object has the following default values for the base platform configuration. 
+basePlatformConfig: | + # -- platform is the service discovery configuration for services across the platform + platform: + # -- runtime specifies the type of runtime the platform is running on. + # Always set to 'kubernetes' for NeMo Platform when deploying with Helm. + runtime: kubernetes + + # Base URLs for various platform services + base_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}" + + # Image configuration for launching containers via the platform + image_registry: nvcr.io/nvidia/nemo + image_tag: {{ .Chart.AppVersion | quote }} + image_pull_secrets: + {{ include "nemo-common.imagepullsecrets" . | nindent 8 }} + + studio: + # -- platform_base_url is the base URL used to access the platform. + # This is the URL that NeMo Studio will use in the browser to communicate with the platform backend services. + # An empty string means the Studio UI will reference its own host for API calls. + platform_base_url: "" + # -- static_files_path points at the Studio UI bundle baked into the container image + # (see docker/Dockerfile.nmp-api). The Python default resolves to the wheel's packaged + # assets, which the container image doesn't populate, so we override here. + static_files_path: "/static/studio" + + auth: + enabled: false + policy_decision_point_provider: embedded + policy_decision_point_base_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . 
) (toString .Values.api.service.port) }}" + policy_data_refresh_interval: 5 + bundle_cache_seconds: 5 + admin_email: "admin@example.com" + + # -- service is the common configuration for service settings on the platform + service: + host: "0.0.0.0" + port: {{ toString .Values.api.service.port }} + log_format: json + + # -- entities is the configuration specific to entity management on the platform + entities: + backend: sqlalchemy + + # -- jobs is the configuration specific to executing jobs on the platform + jobs: + # -- executor_defaults is the default configuration applied to all executor profiles + executor_defaults: + kubernetes_job: + service_account_name: {{ include "nmp-core.jobsServiceAccountName" . | quote }} + launcher_image: {{ include "nmp-core.image" . | quote }} + storage: + pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }} + volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }} + pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }} + volcano_job: + service_account_name: {{ include "nmp-core.jobsServiceAccountName" . | quote }} + launcher_image: {{ include "nmp-core.image" . | quote }} + storage: + pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }} + volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }} + pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }} + {{- if include "nemo-platform.multinodeNetworkingEnabled" . 
}} + # Enable multi-node networking (triggers Kyverno policies for cloud-specific configuration) + enable_multi_node_networking: true + {{- end }} + + # -- secrets is the configuration specific to storing secrets on the platform + secrets: + encryption: + current_provider: local_v1 + providers: + secret_key: + local_v1: + from_env: "NMP_SECRETS_DEFAULT_ENCRYPTION_KEY" + + # -- models is the configuration specific to model management on the platform + models: + controller: + backends: + nim_operator: + enabled: true + files_auth_secret: {{ include "nemo-platform.modelsFilesAuthSecretName" . | quote }} + + # -- inference_gateway is the configuration specific to inference request routing + inference_gateway: {} + + # -- files is the configuration specific to file management on the platform + files: + default_storage_config: + type: local + path: /vol/files + + # -- auditor is the configuration specific to the Auditor service + auditor: {} + + # -- data_designer is the configuration specific to the Data Designer service + data_designer: + model_provider_registry: + default: "mock" + providers: + - name: "mock" + endpoint: "http://localhost:8000" + + # -- customizer is the configuration specific to the Customizer service + customizer: {} + + # -- evaluator is the configuration specific to the Evaluator service + evaluator: {} + + # -- guardrails is the configuration specific to the Guardrails service + guardrails: {} + + +ingress: + # -- Specifies whether to enable the ingress. + enabled: false + # -- Annotations for the ingress resource. + annotations: {} + # -- The ingress class to use if your cluster has more than one class. + className: "" + # -- Optional default hostname. When set, one rule is generated with this host and paths from the first entry in ingress.hosts. + defaultHost: "" + # -- TLS configurations. + tls: [] + hosts: + # -- Hostname used by ingress. If blank, use path-only routing. 
+ - name: "" + paths: + - path: / + pathType: Exact + service: '{{ include "nemo-platform.ingressBackendService" . }}' + port: '{{ include "nemo-platform.ingressBackendPort" . }}' + - path: /apis + pathType: Prefix + service: '{{ include "nemo-platform.ingressBackendService" . }}' + port: '{{ include "nemo-platform.ingressBackendPort" . }}' + - path: /studio + pathType: Prefix + service: '{{ include "nemo-platform.ingressBackendService" . }}' + port: '{{ include "nemo-platform.ingressBackendPort" . }}' + - path: /cluster-info + pathType: Exact + service: '{{ include "nemo-platform.ingressBackendService" . }}' + port: '{{ include "nemo-platform.ingressBackendPort" . }}' + - path: /status + pathType: Exact + service: '{{ include "nemo-platform.ingressBackendService" . }}' + port: '{{ include "nemo-platform.ingressBackendPort" . }}' + +httpRoute: + # -- Specifies whether to enable a Gateway API HTTP Route for the service. + enabled: false + # -- Extra labels for the HTTP Route object. + labels: {} + # -- Extra annotations for the HTTP Route object. + annotations: {} + # -- A list of Gateways to enable this route on. This is required if httpRoute.enabled is true. + parentRefs: [] + # -- If this has a specific hostname, add the name or names here in an array. + hostnames: [] + # -- Path matches to route queries. + pathRules: + - matches: + - path: / + type: Exact + - path: /apis + type: PathPrefix + - path: /studio + type: PathPrefix + - path: /cluster-info + type: Exact + - path: /status + type: Exact + backends: + - service: '{{ include "nemo-platform.ingressBackendService" . }}' + port: '{{ include "nemo-platform.ingressBackendPort" . }}' + # -- This is a list of filters for the objects, such as CORS settings. + filters: [] + +# -- OpenShift Route (route.openshift.io/v1). Use on OpenShift to expose the API via a Route instead of Ingress. +openshiftRoute: + # -- Specifies whether to create an OpenShift Route for the API service. 
+ enabled: false + # -- Hostname for the route. If empty, the OpenShift router may assign a default hostname. + host: "" + # -- Service name to route to. Defaults to Envoy when auth+envoy enabled, otherwise API (tpl-evaluated). + service: '{{ include "nemo-platform.ingressBackendService" . }}' + # -- Target port on the service. Defaults to Envoy or API port depending on auth (tpl-evaluated). + targetPort: '{{ include "nemo-platform.ingressBackendPort" . }}' + # -- Optional TLS configuration (termination, certificate, key, etc.). See OpenShift Route spec. + tls: {} + # -- Annotations for the route resource. + annotations: {} + # -- Labels for the route resource. + labels: {} + +# -- OpenTelemetry configuration settings for all services. +# @default -- This object has the following default values for the OpenTelemetry configuration. +telemetry: + # -- Disable OpenTelemetry instrumentation and exporting for all services. + OTEL_SDK_DISABLED: false + # -- The OpenTelemetry grpc collector endpoint to export traces and metrics to. + OTEL_EXPORTER_OTLP_ENDPOINT: "" + # -- Whether to use an insecure connection (no TLS) to the OpenTelemetry collector endpoint. + OTEL_EXPORTER_OTLP_INSECURE: true + # -- The OpenTelemetry traces exporter to use. Options are "otlp" or "none" to disable export. + OTEL_TRACES_EXPORTER: "none" + # -- The OpenTelemetry metrics exporter to use. Options are "otlp", "prometheus" or "none" to disable export. + OTEL_METRICS_EXPORTER: "none" + # -- The OpenTelemetry traces exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set. + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: null + # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry traces exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set. + OTEL_EXPORTER_OTLP_TRACES_INSECURE: true + # -- The OpenTelemetry metrics exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set. 
+ OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: null + # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry metrics exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set. + OTEL_EXPORTER_OTLP_METRICS_INSECURE: true + +# -- Pod security context settings applied to all services by default. +# These can be overridden in individual service configurations. +# @default -- This object has the following default values for the pod security context. +podSecurityContext: {} + +# -- Container security context settings applied to all services by default. +# These can be overridden in individual service configurations. +# @default -- This object has the following default values for the container security context. +securityContext: {} + +# -- API configuration settings for the api deployment +# @default -- This object has the following default values for the API configuration. +api: + # -- Specifies whether to enable the api deployment. + enabled: true + + # -- Container image configuration for the api deployment. + # @default -- This object has the following default values for the image configuration. + image: + # -- The registry where the NeMo Platform image is located. + repository: nvcr.io/nvidia/nemo/nmp-api + # -- The image pull policy determining when to pull new images. + pullPolicy: IfNotPresent + # -- The image tag to use. + tag: "" + + # -- OpenTelemetry configuration overrides for the api deployment. + telemetry: {} + + # -- Number of replicas for the API service. + replicaCount: 1 + # -- Additional arguments to pass to the Platform API service + extraArgs: [] + # -- Service account configuration for the API service. + # @default -- This object has the following default values for the service account configuration. + serviceAccount: + # -- Specifies whether a service account should be created. + create: true + # -- Automatically mount a ServiceAccount's API credentials. + automount: true + # -- Annotations to add to the service account. 
+ annotations: {} + # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template. + name: "" + # -- Annotations to add to the API service deployment. + annotations: {} + # -- Annotations to add to the API service pod. + podAnnotations: {} + # -- Labels for the API service pod. + podLabels: {} + # -- Pod-level security context settings for the API service. + # @default -- This object has the following default values for the pod security context. + podSecurityContext: + # -- The file system group ID to use for all containers. + fsGroup: 1000 + # -- Container-level security context settings for the API service. + securityContext: {} + # -- Service configuration for the API service. + # @default -- This object has the following default values for the service configuration. + service: + # -- The Kubernetes service type to create. + type: ClusterIP + # -- The port number to expose for the service. + port: 8080 + # -- Annotations for the API service. + annotations: {} + # -- Kubernetes deployment resources configuration for the API service. + resources: {} + + # -- Startup probe configuration for the api service. + # @default -- This object has the following default values for the startup probe configuration. + startupProbe: + # -- Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting). + initialDelaySeconds: 10 + # -- The HTTP GET request to use for the startup probe. + httpGet: + path: /health/ready + port: http + # -- The frequency in seconds to perform the startup probe. + periodSeconds: 15 + # -- The timeout in seconds for the startup probe. + timeoutSeconds: 5 + # -- The failure threshold for the startup probe. + failureThreshold: 24 + + # -- Liveness probe configuration for the api service. + # @default -- This object has the following default values for the liveness probe configuration. 
+ livenessProbe: + # -- The HTTP GET request to use for the liveness probe. + httpGet: + path: /health/live + port: http + # -- The frequency in seconds to perform the liveness probe. + periodSeconds: 10 + # -- The timeout in seconds for the liveness probe. + timeoutSeconds: 5 + # -- The failure threshold for the liveness probe. + failureThreshold: 3 + + # -- Readiness probe configuration for the api service. + # @default -- This object has the following default values for the readiness probe configuration. + readinessProbe: + # -- The HTTP GET request to use for the readiness probe. + httpGet: + path: /health/ready + port: http + # -- The frequency in seconds to perform the readiness probe. + periodSeconds: 10 + # -- The timeout in seconds for the readiness probe. + timeoutSeconds: 5 + # -- The failure threshold for the readiness probe. + failureThreshold: 3 + + # -- PodDisruptionBudget configuration for the API service. + # @default -- This object has the following default values for the pod disruption budget configuration. + podDisruptionBudget: + # -- Whether to create a PodDisruptionBudget for the API pods. + enabled: false + # -- Minimum number of API pods that must remain available during voluntary disruptions. + # Only one of minAvailable or maxUnavailable may be set. + minAvailable: 1 + # -- Maximum number of API pods that can be unavailable during voluntary disruptions. + # Only one of minAvailable or maxUnavailable may be set. + # maxUnavailable: 0 + # -- Annotations for the PodDisruptionBudget. + annotations: {} + + # -- Specifies autoscaling configurations for the deployment. + autoscaling: + # -- Whether to enable horizontal pod autoscaler. + enabled: false + # -- The minimum number of replicas for the deployment. + minReplicas: 1 + # -- The maximum number of replicas for the deployment. + maxReplicas: 10 + # -- The target CPU utilization percentage. 
+ targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + # -- Annotations for the HorizontalPodAutoscaler. + annotations: {} + + # Environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object} + env: {} + # -- Node selector configuration for the API service. + nodeSelector: {} + # -- Affinity configuration for the API service. + affinity: {} + # -- Tolerations configuration for the API service. + tolerations: [] + # -- Topology spread constraints for the API service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ + topologySpreadConstraints: [] + + # ServiceMonitor configuration for Prometheus Operator + serviceMonitor: + # -- Enable ServiceMonitor resources for Prometheus Operator + enabled: false + # -- Scrape interval for the ServiceMonitor + interval: "30s" + # -- Scheme to use for scraping metrics (http or https) + scheme: "http" + # -- Additional labels to add to the ServiceMonitor + labels: {} + # -- Additional annotations to add to the ServiceMonitor + annotations: {} + +# -- Platform seed Job (Helm hook: runs after install/upgrade) +# Runs the platform-seed task (guardrails configs, evaluator system entities, data designer filesets). +# Uses post-install,post-upgrade hooks so it runs on fresh installs and can be re-triggered on no-op upgrade. +# @default -- This object has the following default values for the platform seed Job configuration. +platformSeedJob: + # -- Specifies whether to enable the platform-seed Job. + enabled: true + # -- Seconds after the Job finishes (success or failure) before it is eligible for automatic deletion. + ttlSecondsAfterFinished: 86400 + # -- Number of retries before considering the Job failed. + backoffLimit: 6 + # -- Maximum time in seconds the Job can run. + activeDeadlineSeconds: 600 + # -- Pod-level security context for the platform seeding Job pod. 
+ podSecurityContext: {} + # -- Container-level security context for the platform-seed container. + securityContext: {} + # -- Resource requests/limits for the platform-seed container. + resources: {} + # -- Extra environment variables for the platform-seed container (e.g. CONFIG_STORE_PATH, NMP_PLATFORM_SEED_*). + extraEnv: [] + # -- Node selector for the platform seeding Job pod. + nodeSelector: {} + # -- Affinity for the platform seeding Job pod. + affinity: {} + # -- Tolerations for the platform seeding Job pod. + tolerations: [] + # -- Additional labels for the platform seeding Job pod. + podLabels: {} + +# -- Core deployment configuration settings +# @default -- This object has the following default values for the core deployment configuration. +core: + # -- Specifies whether to enable the core deployment. + enabled: true + + # -- Container image configuration for the core deployment. + # @default -- This object has the following default values for the image configuration. + image: + # -- The registry where the NeMo Platform image is located. + repository: nvcr.io/nvidia/nemo/nmp-api + # -- The image pull policy determining when to pull new images. + pullPolicy: IfNotPresent + # -- The image tag to use. + tag: "" + + storage: + # -- If set, pods will mount this persistent volume for job-scoped storage + # and we will not create a new persistent volume claim. + existingPersistentVolumeName: "" + # -- Which storageClass to use when creating a new persistent volume claim. Empty string uses the cluster's default StorageClass. + storageClass: "" + # -- accessModes for the persistent volume claim. This should include `ReadWriteMany` to ensure + # multiple job pods can write to the volume concurrently. 
+ accessModes: + - ReadWriteMany + # -- size of the persistent volume claim used for persistent storage + size: 200Gi + # -- volumePermissionsImage is the image used to set permissions on the volume + volumePermissionsImage: "busybox" + # -- Annotations to add to the persistent volume claim + annotations: {} + + # -- OpenTelemetry configuration overrides for the platform deployment. + telemetry: {} + + # -- Service account configuration for pods created by the jobs controller (Kubernetes/Volcano job pods). + # @default -- This object has the following default values for the jobs service account configuration. + jobs: + serviceAccount: + # -- Specifies whether a service account should be created for job pods. + create: true + # -- Automatically mount a ServiceAccount's API credentials. + automount: true + # -- Annotations to add to the service account. + annotations: {} + # -- The name of the service account to use. If not set and create is true, a name is generated with a '-jobs' suffix. + name: "" + + # @default -- This object has the following default values for the controller configuration. + controller: + # -- Service account configuration for the controller service. + # @default -- This object has the following default values for the service account configuration. + serviceAccount: + # -- Specifies whether a service account should be created. + create: true + # -- Automatically mount a ServiceAccount's API credentials. + automount: true + # -- Annotations to add to the service account. + annotations: {} + # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template. + name: "" + # -- Additional arguments to pass to the Core Controller service + extraArgs: [] + + # -- Service configuration for the controller service. This only configures a headless service for DNS resolution. + # @default -- This object has the following default values for the service configuration. 
+ service: + # -- The port for the service. + port: 8080 + # -- Annotations for the headless controller service. + annotations: {} + # -- Annotations to add to the controller service deployment. + annotations: {} + # -- Annotations to add to the controller service pod. + podAnnotations: {} + # -- Labels for the controller service pod. + podLabels: {} + # -- Pod-level security context settings for the controller service. + # @default -- This object has the following default values for the pod security context. + podSecurityContext: + # -- The file system group ID to use for all containers. + fsGroup: 1000 + # -- Container-level security context settings for the controller service. + securityContext: {} + # -- Kubernetes deployment resources configuration for the controller service. + resources: {} + + # -- Startup probe configuration for the core service. + # @default -- This object has the following default values for the startup probe configuration. + startupProbe: + # -- Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting). + initialDelaySeconds: 10 + # -- The HTTP GET request to use for the startup probe. + httpGet: + path: /health/ready + port: http + # -- The frequency in seconds to perform the startup probe. + periodSeconds: 15 + # -- The timeout in seconds for the startup probe. + timeoutSeconds: 5 + # -- The failure threshold for the startup probe. + failureThreshold: 24 + + # -- Liveness probe configuration for the controller service. + # @default -- This object has the following default values for the liveness probe configuration. + livenessProbe: + # -- The HTTP GET request to use for the liveness probe. + httpGet: + path: /health/live + port: http + # -- The frequency in seconds to perform the liveness probe. + periodSeconds: 10 + # -- The timeout in seconds for the liveness probe. + timeoutSeconds: 5 + # -- The failure threshold for the liveness probe. 
+ failureThreshold: 3 + + # -- Readiness probe configuration for the controller service. + # @default -- This object has the following default values for the readiness probe configuration. + readinessProbe: + # -- The HTTP GET request to use for the readiness probe. + httpGet: + path: /health/ready + port: http + # -- The frequency in seconds to perform the readiness probe. + periodSeconds: 10 + # -- The timeout in seconds for the readiness probe. + timeoutSeconds: 5 + # -- The failure threshold for the readiness probe. + failureThreshold: 3 + # -- Additional environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}. + env: {} + # -- Node selector configuration for the controller service. + nodeSelector: {} + # -- Affinity configuration for the controller service. + affinity: {} + # -- Tolerations configuration for the controller service. + tolerations: [] + # -- Topology spread constraints for the controller service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ + topologySpreadConstraints: [] + + # ServiceMonitor configuration for Prometheus Operator + serviceMonitor: + # -- Enable ServiceMonitor resources for Prometheus Operator + enabled: false + # -- Scrape interval for the ServiceMonitor + interval: "30s" + # -- Scheme to use for scraping metrics (http or https) + scheme: "http" + # -- Additional labels to add to the ServiceMonitor + labels: {} + # -- Additional annotations to add to the ServiceMonitor + annotations: {} + + +# -- Envoy proxy configuration settings. Resources are created only when platform config has auth.enabled: true (see platformConfig.auth.enabled). +# @default -- This object has the following default values for the envoy proxy configuration. +envoyProxy: + # -- Specifies whether to enable the Envoy proxy deployment. Rendered only when platform config has auth.enabled: true. 
+ enabled: true + + # Headers considered internal-only + trustedHeaders: + - x-nmp-principal-id + - x-nmp-principal-email + - x-nmp-principal-groups + - x-nmp-principal-on-behalf-of + - x-nmp-principal-on-behalf-of-groups + - x-nmp-principal-on-behalf-of-email + - x-nmp-principal-filters + - x-nmp-principal-roles + - x-nmp-internal + + # Number of Envoy proxy replicas + replicaCount: 2 + + # Envoy image + image: + repository: envoyproxy/envoy + tag: v1.37.0 + pullPolicy: IfNotPresent + + # -- Service account configuration for the Envoy service. + # @default -- This object has the following default values for the service account configuration. + serviceAccount: + # -- Specifies whether a service account should be created. + create: true + # -- Automatically mount a ServiceAccount's API credentials. + automount: true + # -- Annotations to add to the service account. + annotations: {} + # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template. + name: "" + # -- Annotations to add to the Envoy service deployment. + annotations: {} + # -- Annotations to add to the Envoy service pod. + podAnnotations: {} + # -- Labels for the Envoy service pod. + podLabels: {} + # -- Pod-level security context settings for the Envoy service. + # @default -- This object has the following default values for the pod security context. + podSecurityContext: + # -- The file system group ID to use for all containers. + fsGroup: 1000 + # -- Container-level security context settings for the Envoy service. + securityContext: {} + # -- Service configuration for the Envoy service. + # @default -- This object has the following default values for the service configuration. + service: + # -- The Kubernetes service type to create. + type: ClusterIP + # -- The port number to expose for the service. + port: 8080 + # -- Annotations for the Envoy service. 
+ annotations: {} + + # -- Envoy Admin port + adminPort: 9901 + + # -- Timeouts for proxying to long-lived streams (e.g. inference gateway). Use "0s" to disable a timeout. + # @default -- Tuned for streaming; increase or set to "0s" if requests are cut off. + timeouts: + # -- Stream idle timeout. Time with no activity before stream is closed. 0 = disabled (required for long-lived streams). + streamIdle: "0s" + # -- Time to receive full request headers. 0 = disabled. + requestHeaders: "60s" + # -- Total request timeout. 0 = disabled (required for streaming; not compatible with streaming if set). + request: "0s" + # -- Per-route timeout for the passthrough to backend. 0 = disabled. + route: "0s" + # -- Cluster connect timeout (time to establish connection to backend). + connect: "30s" + + # -- Kubernetes deployment resources configuration for the Envoy service. + resources: {} + + # -- Liveness probe for the Envoy container (admin interface /ready). + livenessProbe: + httpGet: + path: /ready + port: admin + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + # -- Readiness probe for the Envoy container (admin interface /ready). + readinessProbe: + httpGet: + path: /ready + port: admin + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + # -- Startup probe for the Envoy container (admin interface /ready). + startupProbe: + httpGet: + path: /ready + port: admin + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 12 + + # -- PodDisruptionBudget configuration for the Envoy service. + # @default -- This object has the following default values for the pod disruption budget configuration. + podDisruptionBudget: + # -- Whether to create a PodDisruptionBudget for the Envoy pods. + enabled: false + # -- Minimum number of Envoy pods that must remain available during voluntary disruptions. + # Only one of minAvailable or maxUnavailable may be set. 
+ minAvailable: 1 + # -- Maximum number of Envoy pods that can be unavailable during voluntary disruptions. + # Only one of minAvailable or maxUnavailable may be set. + # maxUnavailable: 0 + # -- Annotations for the PodDisruptionBudget. + annotations: {} + + # -- Specifies autoscaling configurations for the deployment. + autoscaling: + # -- Whether to enable horizontal pod autoscaler. + enabled: false + # -- The minimum number of replicas for the deployment. + minReplicas: 1 + # -- The maximum number of replicas for the deployment. + maxReplicas: 10 + # -- The target CPU utilization percentage. + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + # -- Annotations for the HorizontalPodAutoscaler. + annotations: {} + + # Environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object} + env: {} + # -- Node selector configuration for the Envoy pods. + nodeSelector: {} + # -- Affinity configuration for the Envoy pods. + affinity: {} + # -- Tolerations configuration for the Envoy pods. + tolerations: [] + # -- Topology spread constraints for the Envoy pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ + topologySpreadConstraints: [] + + # -- Extra arguments to append to the envoy container command. Useful for passing server flags such as concurrency. 
+ # Example: ["--concurrency", "4"] + extraArgs: [] + + # ServiceMonitor configuration for Prometheus Operator + serviceMonitor: + # -- Enable ServiceMonitor resources for Prometheus Operator + enabled: false + # -- Scrape interval for the ServiceMonitor + interval: "30s" + # -- Scheme to use for scraping metrics (http or https) + scheme: "http" + # -- Additional labels to add to the ServiceMonitor + labels: {} + # -- Additional annotations to add to the ServiceMonitor + annotations: {} diff --git a/services/core/jobs/README.md b/services/core/jobs/README.md index 83eb3c9708..de89b969dd 100644 --- a/services/core/jobs/README.md +++ b/services/core/jobs/README.md @@ -56,7 +56,7 @@ from nemo_platform_plugin.jobs.api_factory import ( from pydantic_settings import BaseSettings class MyFunctionalMicroserviceSettings(BaseSettings): - job_image: str = Field(default="nvcr.io/nvidia/nemo-microservices/my-functional-microservice:v0.0.1") + job_image: str = Field(default="nvcr.io/nvidia/nemo/my-functional-microservice:v0.0.1") job_command: list[str] = Field(default=[]) job_args: list[str] = Field(default=["--target", "default"]) default_job_resource_cpu_request: str = Field(default="1") @@ -225,7 +225,7 @@ curl http://localhost:8080/v1/jobs/job-some-random-id "provider": "cpu", "profile": "default", "container": { - "image": "nvcr.io/nvidia/nemo-microservices/my-functional-microservice:v0.0.1", + "image": "nvcr.io/nvidia/nemo/my-functional-microservice:v0.0.1", "command": [], "args": ["--target", "default"] }, @@ -251,7 +251,7 @@ curl http://localhost:8080/v1/jobs/job-some-random-id "provider": "cpu", "profile": "default", "container": { - "image": "nvcr.io/nvidia/nemo-microservices/my-functional-microservice:v0.0.1" + "image": "nvcr.io/nvidia/nemo/my-functional-microservice:v0.0.1" }, "resources": { "requests": { @@ -412,7 +412,7 @@ docker buildx ls - Install kind: https://kind.sigs.k8s.io/docs/user/quick-start/#installation - Create a kind cluster with registry support 
using `bash /script/kind-with-registry.sh` - Install Skaffold: https://skaffold.dev/docs/install/ -- Update dependencies of the helm chart using `helm dep update helm/platform-ea` +- Update dependencies of the Helm chart using `helm dep update k8s/helm` - Run skaffold: `skaffold dev --default-repo=localhost:5001 --keep-running-on-failure` - If port forward doesn't work, try manually portforwarding: `kubectl port-forward service/nemo-core-api 8000:8000` diff --git a/services/guardrails/callouts/README.md b/services/guardrails/callouts/README.md index f10ab04b53..002b7dd1c8 100644 --- a/services/guardrails/callouts/README.md +++ b/services/guardrails/callouts/README.md @@ -9,7 +9,7 @@ First build local docker images required. BUILD_ARCH= docker buildx bake guardrails-docker guardrails-callout-docker guardrails-callout-mock-llm --load ``` -Create nemoguard configs similar to the ones described in [guardrails-callout-values.yaml](../../../deploy/helm/values/gcp/guardrails-callout-values.yaml). +Create nemoguard configs using the Guardrails configuration examples in the product documentation. Spin up Guardrails MS via compose. diff --git a/tools/lint/lint-helm.sh b/tools/lint/lint-helm.sh index cd7c8cc7d8..3107d451c4 100755 --- a/tools/lint/lint-helm.sh +++ b/tools/lint/lint-helm.sh @@ -2,7 +2,7 @@ set -xeo pipefail -HELM_FOLDER=${HELM_FOLDER:-deploy/helm/platform} +HELM_FOLDER=${HELM_FOLDER:-k8s/helm} HELM_RELEASE_NAME=${HELM_RELEASE_NAME:-nemo-platform} OPENSHIFT_VERSION=${OPENSHIFT_VERSION:-4.1.0} diff --git a/web/packages/studio/scripts/feature-flag-matrix.ts b/web/packages/studio/scripts/feature-flag-matrix.ts index 61fe3bf12e..86c14ca759 100644 --- a/web/packages/studio/scripts/feature-flag-matrix.ts +++ b/web/packages/studio/scripts/feature-flag-matrix.ts @@ -12,7 +12,7 @@ * 1. Studio code defaults (featureFlags.ts) * 2. Local env overrides (.env.dev.local) * 3. env_mappings.py defaults (deployment runtime injection) - * 4. 
Helm CI values files (deploy/helm/values/ci/*.yaml) + * 4. Optional deployment values files from HELM_DEPLOYMENT_VALUES_DIR * * Usage: * pnpm feature-flags # CLI color-coded per-flag cards @@ -34,7 +34,9 @@ const FEATURE_FLAGS_TS = path.join( ); const ENV_MAPPINGS_PY = path.join(REPO_ROOT, 'services/studio/src/nmp/studio/env_mappings.py'); const LOCAL_ENV_FILE = path.join(REPO_ROOT, 'web/packages/studio/env/.env.dev.local'); -const HELM_CI_DIR = path.join(REPO_ROOT, 'deploy/helm/values/ci'); +const HELM_DEPLOYMENT_VALUES_DIR = process.env.HELM_DEPLOYMENT_VALUES_DIR + ? path.resolve(REPO_ROOT, process.env.HELM_DEPLOYMENT_VALUES_DIR) + : ''; // --------------------------------------------------------------------------- // Helpers @@ -106,7 +108,7 @@ function parseLocalEnv(src: string): Record { } /** - * Parse per-target overrides from a Helm CI values yaml (e.g. dev-values.yaml). + * Parse per-target overrides from a Helm deployment values yaml (e.g. dev-values.yaml). * Reads platformConfig.studio.feature_flags and extracts snake_case key/value pairs. * These are the highest-priority values for deployed environments, overriding * env_mappings.py defaults for that specific target. @@ -142,13 +144,16 @@ const localEnvOverrides = fs.existsSync(LOCAL_ENV_FILE) : {}; const envMappingDefaults = safeRead(ENV_MAPPINGS_PY, parseEnvMappings, {}); -const ciFiles = fs - .readdirSync(HELM_CI_DIR) - .filter((f) => f.endsWith('.yaml')) - .sort(); +const ciFiles = + HELM_DEPLOYMENT_VALUES_DIR && fs.existsSync(HELM_DEPLOYMENT_VALUES_DIR) + ? 
fs + .readdirSync(HELM_DEPLOYMENT_VALUES_DIR) + .filter((f) => f.endsWith('.yaml')) + .sort() + : []; const ciOverrides: Record> = {}; for (const file of ciFiles) { - ciOverrides[file] = safeRead(path.join(HELM_CI_DIR, file), parseHelmYaml, {}); + ciOverrides[file] = safeRead(path.join(HELM_DEPLOYMENT_VALUES_DIR, file), parseHelmYaml, {}); } const allFlags = [ @@ -199,7 +204,7 @@ if (process.argv.includes('--json')) { deployments[envName(file)] = { value: resolveDeployment(flag, file), override: { - path: rel(path.join(HELM_CI_DIR, file)), + path: rel(path.join(HELM_DEPLOYMENT_VALUES_DIR, file)), value: ciOverrides[file]?.[flag] ?? null, }, }; From b23b7ed59024bbc20054a883a52f725086aa8230 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Fri, 12 Jun 2026 14:21:41 -0700 Subject: [PATCH 2/5] chore: clean up some issues with linting Signed-off-by: Brooke Storm --- k8s/helm/README.md | 6 +++--- k8s/helm/ci/01-preexisting-imagepullsecret.yaml | 1 - k8s/helm/files/nccl-test/nccl_test.py | 3 +++ k8s/helm/files/nccl-test/orchestrator.py | 3 +++ k8s/helm/values.yaml | 8 ++++---- pyproject.toml | 4 ++++ services/core/jobs/README.md | 6 +++--- 7 files changed, 20 insertions(+), 11 deletions(-) delete mode 100644 k8s/helm/ci/01-preexisting-imagepullsecret.yaml diff --git a/k8s/helm/README.md b/k8s/helm/README.md index b0620c6334..35f97c0b69 100644 --- a/k8s/helm/README.md +++ b/k8s/helm/README.md @@ -22,7 +22,7 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. | api.extraArgs | list | `[]` | Additional arguments to pass to the Platform API service | | api.image | object | This object has the following default values for the image configuration. | Container image configuration for the api deployment. | | api.image.pullPolicy | string | `"IfNotPresent"` | The image pull policy determining when to pull new images. | -| api.image.repository | string | `"nvcr.io/nvidia/nemo/nmp-api"` | The registry where the NeMo Platform image is located. 
| +| api.image.repository | string | `"nvcr.io/nvidia/nemo-platform/nmp-api"` | The registry where the NeMo Platform image is located. | | api.image.tag | string | `""` | The image tag to use. | | api.livenessProbe | object | This object has the following default values for the liveness probe configuration. | Liveness probe configuration for the api service. | | api.livenessProbe.failureThreshold | int | `3` | The failure threshold for the liveness probe. | @@ -111,7 +111,7 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. | core.enabled | bool | `true` | Specifies whether to enable the core deployment. | | core.image | object | This object has the following default values for the image configuration. | Container image configuration for the core deployment. | | core.image.pullPolicy | string | `"IfNotPresent"` | The image pull policy determining when to pull new images. | -| core.image.repository | string | `"nvcr.io/nvidia/nemo/nmp-api"` | The registry where the NeMo Platform image is located. | +| core.image.repository | string | `"nvcr.io/nvidia/nemo-platform/nmp-api"` | The registry where the NeMo Platform image is located. | | core.image.tag | string | `""` | The image tag to use. | | core.jobs | object | This object has the following default values for the jobs service account configuration. | Service account configuration for pods created by the jobs controller (Kubernetes/Volcano job pods). | | core.jobs.serviceAccount.annotations | object | `{}` | Annotations to add to the service account. | @@ -222,7 +222,7 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. 
| multinodeNetworking.oci.enabled | bool | `false` | Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration | | multinodeNetworking.oci.rdmaDevicesPerGPU | int | `8` | Number of RDMA devices (mlnxnics) to request per GPU | | nameOverride | string | `""` | Overrides for name and fullname templates | -| ncclTest | object | `{"configMapCleanupJob":{"image":{"repository":"bitnami/kubectl","tag":"latest"}},"gpuNodeLabelKey":"nvidia.com/gpu.present","gpuNodeLabelValue":"true","gpuResourceKey":"nvidia.com/gpu","gpusPerNode":1,"iterations":3,"orchestrator":{"image":{"repository":"docker.io/library/python","tag":"3.12-slim"},"resources":{"limits":{"cpu":"1","memory":"512Mi"},"requests":{"cpu":"100m","memory":"256Mi"}}},"validation":{"minBandwidthMBpsAt1024MB":8000},"waitTimeoutSeconds":900,"worker":{"image":{"repository":"nvcr.io/nvidia/nemo/nmp-automodel-training","tag":""},"resources":{"limits":{"cpu":"8","memory":"16Gi"},"requests":{"cpu":"4","memory":"8Gi"}}}}` | NCCL chart test (`helm test`): multi-node allreduce check. Templates use helm.sh/hook: test — they are not created on install/upgrade, only when you run helm test. Requires nodes labeled with gpuNodeLabelKey/gpuNodeLabelValue (default NFD / GPU operator style). 
See https://helm.sh/docs/topics/chart_tests/ | +| ncclTest | object | `{"configMapCleanupJob":{"image":{"repository":"bitnami/kubectl","tag":"latest"}},"gpuNodeLabelKey":"nvidia.com/gpu.present","gpuNodeLabelValue":"true","gpuResourceKey":"nvidia.com/gpu","gpusPerNode":1,"iterations":3,"orchestrator":{"image":{"repository":"docker.io/library/python","tag":"3.12-slim"},"resources":{"limits":{"cpu":"1","memory":"512Mi"},"requests":{"cpu":"100m","memory":"256Mi"}}},"validation":{"minBandwidthMBpsAt1024MB":8000},"waitTimeoutSeconds":900,"worker":{"image":{"repository":"nvcr.io/nvidia/nemo-platform/nmp-automodel-training","tag":""},"resources":{"limits":{"cpu":"8","memory":"16Gi"},"requests":{"cpu":"4","memory":"8Gi"}}}}` | NCCL chart test (`helm test`): multi-node allreduce check. Templates use helm.sh/hook: test — they are not created on install/upgrade, only when you run helm test. Requires nodes labeled with gpuNodeLabelKey/gpuNodeLabelValue (default NFD / GPU operator style). See https://helm.sh/docs/topics/chart_tests/ | | ncclTest.configMapCleanupJob | object | `{"image":{"repository":"bitnami/kubectl","tag":"latest"}}` | Post-test hook Job (after orchestrator): kubectl deletes the scripts ConfigMap (helm.sh/hook-weight 5). | | ncclTest.gpuNodeLabelKey | string | `"nvidia.com/gpu.present"` | Node label used to discover GPU workers (must match your cluster). | | ncclTest.gpuResourceKey | string | `"nvidia.com/gpu"` | Resource name for GPU capacity on worker pods (e.g. nvidia.com/gpu or a MIG device). 
| diff --git a/k8s/helm/ci/01-preexisting-imagepullsecret.yaml b/k8s/helm/ci/01-preexisting-imagepullsecret.yaml deleted file mode 100644 index 334f0697fc..0000000000 --- a/k8s/helm/ci/01-preexisting-imagepullsecret.yaml +++ /dev/null @@ -1 +0,0 @@ -existingImagePullSecret: "nvcrimagepullsecret" diff --git a/k8s/helm/files/nccl-test/nccl_test.py b/k8s/helm/files/nccl-test/nccl_test.py index 8253b2f1e6..65d9cbdc51 100644 --- a/k8s/helm/files/nccl-test/nccl_test.py +++ b/k8s/helm/files/nccl-test/nccl_test.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + import os import sys import time diff --git a/k8s/helm/files/nccl-test/orchestrator.py b/k8s/helm/files/nccl-test/orchestrator.py index 156d4f9614..49e54bd88a 100644 --- a/k8s/helm/files/nccl-test/orchestrator.py +++ b/k8s/helm/files/nccl-test/orchestrator.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + """Discover GPU nodes, run one NCCL worker pod per node, then delete workers.""" import os diff --git a/k8s/helm/values.yaml b/k8s/helm/values.yaml index 5683f7eec0..3167f8b36b 100644 --- a/k8s/helm/values.yaml +++ b/k8s/helm/values.yaml @@ -115,7 +115,7 @@ ncclTest: tag: "latest" worker: image: - repository: nvcr.io/nvidia/nemo/nmp-automodel-training + repository: nvcr.io/nvidia/nemo-platform/nmp-automodel-training tag: "" resources: requests: @@ -234,7 +234,7 @@ basePlatformConfig: | base_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}" # Image configuration for launching containers via the platform - image_registry: nvcr.io/nvidia/nemo + image_registry: nvcr.io/nvidia/nemo-platform image_tag: {{ .Chart.AppVersion | quote }} image_pull_secrets: {{ include "nemo-common.imagepullsecrets" . 
| nindent 8 }} @@ -462,7 +462,7 @@ api: # @default -- This object has the following default values for the image configuration. image: # -- The registry where the NeMo Platform image is located. - repository: nvcr.io/nvidia/nemo/nmp-api + repository: nvcr.io/nvidia/nemo-platform/nmp-api # -- The image pull policy determining when to pull new images. pullPolicy: IfNotPresent # -- The image tag to use. @@ -647,7 +647,7 @@ core: # @default -- This object has the following default values for the image configuration. image: # -- The registry where the NeMo Platform image is located. - repository: nvcr.io/nvidia/nemo/nmp-api + repository: nvcr.io/nvidia/nemo-platform/nmp-api # -- The image pull policy determining when to pull new images. pullPolicy: IfNotPresent # -- The image tag to use. diff --git a/pyproject.toml b/pyproject.toml index 27458e38bc..851293f45c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -560,6 +560,10 @@ exclude = [ "services/automodel/src/nmp/automodel/tasks/training/backends/", "services/automodel/src/nmp/automodel/tasks/training/utils.py", "services/unsloth/src/nmp/unsloth/tasks/training/backends/", + # Helm chart test payloads run inside purpose-built Kubernetes containers. + # The NCCL worker imports torch at runtime, which is not part of the root + # workspace type-checking environment. 
+ "k8s/helm/files/nccl-test/", "./services/guardrails/", diff --git a/services/core/jobs/README.md b/services/core/jobs/README.md index de89b969dd..88557ca6d6 100644 --- a/services/core/jobs/README.md +++ b/services/core/jobs/README.md @@ -56,7 +56,7 @@ from nemo_platform_plugin.jobs.api_factory import ( from pydantic_settings import BaseSettings class MyFunctionalMicroserviceSettings(BaseSettings): - job_image: str = Field(default="nvcr.io/nvidia/nemo/my-functional-microservice:v0.0.1") + job_image: str = Field(default="nvcr.io/nvidia/nemo-platform/my-functional-microservice:v0.0.1") job_command: list[str] = Field(default=[]) job_args: list[str] = Field(default=["--target", "default"]) default_job_resource_cpu_request: str = Field(default="1") @@ -225,7 +225,7 @@ curl http://localhost:8080/v1/jobs/job-some-random-id "provider": "cpu", "profile": "default", "container": { - "image": "nvcr.io/nvidia/nemo/my-functional-microservice:v0.0.1", + "image": "nvcr.io/nvidia/nemo-platform/my-functional-microservice:v0.0.1", "command": [], "args": ["--target", "default"] }, @@ -251,7 +251,7 @@ curl http://localhost:8080/v1/jobs/job-some-random-id "provider": "cpu", "profile": "default", "container": { - "image": "nvcr.io/nvidia/nemo/my-functional-microservice:v0.0.1" + "image": "nvcr.io/nvidia/nemo-platform/my-functional-microservice:v0.0.1" }, "resources": { "requests": { From dd8f4f3a2236d24eb6cbe3b4f1771711311d8490 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Fri, 12 Jun 2026 14:33:31 -0700 Subject: [PATCH 3/5] chore: adjust versions Signed-off-by: Brooke Storm --- k8s/helm/Chart.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/helm/Chart.yaml b/k8s/helm/Chart.yaml index 3266ff779c..e4f919bef0 100644 --- a/k8s/helm/Chart.yaml +++ b/k8s/helm/Chart.yaml @@ -6,9 +6,9 @@ maintainers: description: NeMo Platform Helm Chart type: application # version is the version of the Helm chart, and can be different from the appVersion. 
-version: 2.1.0 +version: 0.1.0 # appVersion is the version of the application deployed in the chart. -appVersion: "26.3.0" +appVersion: "0.2.0" home: https://nvidia.com dependencies: ## NMP dependencies From cefd558008e3e508745637423b842acafc194faa Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Fri, 12 Jun 2026 14:36:24 -0700 Subject: [PATCH 4/5] fix: correct test template Signed-off-by: Brooke Storm --- k8s/helm/ci/11-setting-env.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/helm/ci/11-setting-env.yaml b/k8s/helm/ci/11-setting-env.yaml index 4cd71231cf..e2093c3cd8 100644 --- a/k8s/helm/ci/11-setting-env.yaml +++ b/k8s/helm/ci/11-setting-env.yaml @@ -2,8 +2,8 @@ env: MY_ENV_VAR: valueFrom: secretKeyRef: - key: foo - value: bar + name: foo + key: bar ANOTHER_VAR: bloop THIS_WILL_BE_QUOTED_BY_TEMPLATE: true From 02cae3ed2d1f64bb2c88ce1ac2e724d5263b44b9 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Fri, 12 Jun 2026 15:45:58 -0700 Subject: [PATCH 5/5] feat: implement auto-generated KEK secret upon install only Signed-off-by: Brooke Storm --- k8s/helm/README.md | 43 +++- k8s/helm/ci/14-secretfromenv.yaml | 3 +- .../20-explicit-default-encryption-key.yaml | 8 + .../nemo-helm-readme.md.gotmpl | 18 ++ k8s/helm/templates/_helpers.tpl | 55 +++++ .../templates/api-env-secret-generator.yaml | 220 ++++++++++++++++++ .../api-env-secret-upgrade-check.yaml | 11 + k8s/helm/templates/api-env-secret.yaml | 15 +- k8s/helm/templates/api/api-deployment.yaml | 2 +- k8s/helm/templates/platform-seed-job.yaml | 2 +- k8s/helm/templates/tests/nccl-test.yaml | 50 +++- k8s/helm/values.yaml | 53 ++++- 12 files changed, 458 insertions(+), 22 deletions(-) create mode 100644 k8s/helm/ci/20-explicit-default-encryption-key.yaml create mode 100644 k8s/helm/templates/api-env-secret-generator.yaml create mode 100644 k8s/helm/templates/api-env-secret-upgrade-check.yaml diff --git a/k8s/helm/README.md b/k8s/helm/README.md index 35f97c0b69..58729914cf 100644 --- 
a/k8s/helm/README.md +++ b/k8s/helm/README.md @@ -4,6 +4,24 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. +## Platform Secrets Encryption Key + +The platform secrets service reads `NMP_SECRETS_DEFAULT_ENCRYPTION_KEY` from the +API env Secret. The value must be base64-encoded and decode to at least 32 bytes. + +Set `secrets.defaultEncryptionKey.value` to provide your own key. When that value +is empty and `envFromSecret` is not set, the chart runs a pre-install hook that +creates `<release-name>-api-env` with a per-install random key. The hook is +install-only and refuses to patch or rotate an existing Secret. + +Set `envFromSecret` to use a fully user-managed API env Secret. In that mode the +chart does not create or generate the API env Secret. + +On upgrade, the generated Secret must already exist and contain +`NMP_SECRETS_DEFAULT_ENCRYPTION_KEY`. If it is missing, restore the original +Secret instead of generating a replacement key; existing encrypted platform +secrets will not decrypt with a new key. + ## Values | Key | Type | Default | Description | @@ -131,7 +149,7 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. | core.storage.volumePermissionsImage | string | `"busybox"` | volumePermissionsImage is the image used to set permissions on the volume | | core.telemetry | object | `{}` | OpenTelemetry configuration overrides for the platform deployment. | | env | object | `{}` | Environment variables that will be applied to every deployment pod. Uses a simple key value map structure like MY_ENV_VAR: the-key and works with valueFrom as well. | -| envFromSecret | string | `""` | Optional. Name of an existing Kubernetes Secret to load as env vars (envFrom) for the API pod. When set, the chart does not create the default api-env secret; use your own secret (e.g. from Vault, sealed-secrets). 
When unset, the chart creates a default secret with the environment variable NMP_SECRETS_DEFAULT_ENCRYPTION_KEY for default installation. See the NeMo Platform documentation for more details on secrets encryption. | +| envFromSecret | string | `""` | Optional. Name of an existing Kubernetes Secret to load as env vars (envFrom) for the API pod. When set, the chart does not create or generate the default api-env Secret; use your own Secret (for example, from Vault or sealed-secrets). | | envoyProxy | object | This object has the following default values for the envoy proxy configuration. | Envoy proxy configuration settings. Resources are created only when platform config has auth.enabled: true (see platformConfig.auth.enabled). | | envoyProxy.adminPort | int | `9901` | Envoy Admin port | | envoyProxy.affinity | object | `{}` | Affinity configuration for the Envoy pods. | @@ -222,8 +240,8 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. | multinodeNetworking.oci.enabled | bool | `false` | Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration | | multinodeNetworking.oci.rdmaDevicesPerGPU | int | `8` | Number of RDMA devices (mlnxnics) to request per GPU | | nameOverride | string | `""` | Overrides for name and fullname templates | -| ncclTest | object | `{"configMapCleanupJob":{"image":{"repository":"bitnami/kubectl","tag":"latest"}},"gpuNodeLabelKey":"nvidia.com/gpu.present","gpuNodeLabelValue":"true","gpuResourceKey":"nvidia.com/gpu","gpusPerNode":1,"iterations":3,"orchestrator":{"image":{"repository":"docker.io/library/python","tag":"3.12-slim"},"resources":{"limits":{"cpu":"1","memory":"512Mi"},"requests":{"cpu":"100m","memory":"256Mi"}}},"validation":{"minBandwidthMBpsAt1024MB":8000},"waitTimeoutSeconds":900,"worker":{"image":{"repository":"nvcr.io/nvidia/nemo-platform/nmp-automodel-training","tag":""},"resources":{"limits":{"cpu":"8","memory":"16Gi"},"requests":{"cpu":"4","memory":"8Gi"}}}}` | NCCL chart test (`helm 
test`): multi-node allreduce check. Templates use helm.sh/hook: test — they are not created on install/upgrade, only when you run helm test. Requires nodes labeled with gpuNodeLabelKey/gpuNodeLabelValue (default NFD / GPU operator style). See https://helm.sh/docs/topics/chart_tests/ | -| ncclTest.configMapCleanupJob | object | `{"image":{"repository":"bitnami/kubectl","tag":"latest"}}` | Post-test hook Job (after orchestrator): kubectl deletes the scripts ConfigMap (helm.sh/hook-weight 5). | +| ncclTest | object | `{"configMapCleanupJob":{"image":{"repository":"docker.io/library/python","tag":"3.12-slim"}},"gpuNodeLabelKey":"nvidia.com/gpu.present","gpuNodeLabelValue":"true","gpuResourceKey":"nvidia.com/gpu","gpusPerNode":1,"iterations":3,"orchestrator":{"image":{"repository":"docker.io/library/python","tag":"3.12-slim"},"resources":{"limits":{"cpu":"1","memory":"512Mi"},"requests":{"cpu":"100m","memory":"256Mi"}}},"validation":{"minBandwidthMBpsAt1024MB":8000},"waitTimeoutSeconds":900,"worker":{"image":{"repository":"nvcr.io/nvidia/nemo-platform/nmp-automodel-training","tag":""},"resources":{"limits":{"cpu":"8","memory":"16Gi"},"requests":{"cpu":"4","memory":"8Gi"}}}}` | NCCL chart test (`helm test`): multi-node allreduce check. Templates use helm.sh/hook: test — they are not created on install/upgrade, only when you run helm test. Requires nodes labeled with gpuNodeLabelKey/gpuNodeLabelValue (default NFD / GPU operator style). See https://helm.sh/docs/topics/chart_tests/ | +| ncclTest.configMapCleanupJob | object | `{"image":{"repository":"docker.io/library/python","tag":"3.12-slim"}}` | Post-test hook Job (after orchestrator): deletes the scripts ConfigMap (helm.sh/hook-weight 5). | | ncclTest.gpuNodeLabelKey | string | `"nvidia.com/gpu.present"` | Node label used to discover GPU workers (must match your cluster). | | ncclTest.gpuResourceKey | string | `"nvidia.com/gpu"` | Resource name for GPU capacity on worker pods (e.g. nvidia.com/gpu or a MIG device). 
| | ncclTest.gpusPerNode | int | `1` | GPUs per worker pod / per node (torch.distributed nproc_per_node). IMPORTANT: Set this value before testing | @@ -275,6 +293,25 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. | rbac | object | `{"k8sNimOperatorEnabled":true,"volcanoEnabled":true}` | RBAC configuration settings for optional dependencies | | rbac.k8sNimOperatorEnabled | bool | `true` | Specifies whether to enable the core Controller to have RBAC permissions to k8s-nim-operator's NIMService for scheduling NIMs. | | rbac.volcanoEnabled | bool | `true` | Specifies whether to enable the core Controller to have RBAC permissions to Volcano for scheduling distributed jobs. | +| secrets | object | `{"defaultEncryptionKey":{"generated":{"activeDeadlineSeconds":120,"affinity":{},"backoffLimit":3,"enabled":true,"image":{"pullPolicy":"IfNotPresent","repository":"docker.io/library/python","tag":"3.12-slim"},"nodeSelector":{},"podSecurityContext":{},"resources":{},"securityContext":{},"serviceAccount":{"annotations":{},"automount":true,"create":true,"name":""},"tolerations":[],"ttlSecondsAfterFinished":300},"value":""}}` | Secrets service configuration. | +| secrets.defaultEncryptionKey.generated | object | `{"activeDeadlineSeconds":120,"affinity":{},"backoffLimit":3,"enabled":true,"image":{"pullPolicy":"IfNotPresent","repository":"docker.io/library/python","tag":"3.12-slim"},"nodeSelector":{},"podSecurityContext":{},"resources":{},"securityContext":{},"serviceAccount":{"annotations":{},"automount":true,"create":true,"name":""},"tolerations":[],"ttlSecondsAfterFinished":300}` | Generated key configuration used only when value and envFromSecret are empty. The generated key is not rotated or recreated on upgrade. | +| secrets.defaultEncryptionKey.generated.activeDeadlineSeconds | int | `120` | Maximum seconds for the key generation hook to run. 
| +| secrets.defaultEncryptionKey.generated.affinity | object | `{}` | Affinity for the key generation hook. | +| secrets.defaultEncryptionKey.generated.backoffLimit | int | `3` | Number of retries before the key generation hook is marked failed. | +| secrets.defaultEncryptionKey.generated.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy for the pre-install key generation hook. | +| secrets.defaultEncryptionKey.generated.image.repository | string | `"docker.io/library/python"` | Image repository for the pre-install key generation hook. | +| secrets.defaultEncryptionKey.generated.image.tag | string | `"3.12-slim"` | Image tag for the pre-install key generation hook. | +| secrets.defaultEncryptionKey.generated.nodeSelector | object | `{}` | Node selector for the key generation hook. | +| secrets.defaultEncryptionKey.generated.podSecurityContext | object | `{}` | Optional pod security context for the key generation hook. | +| secrets.defaultEncryptionKey.generated.resources | object | `{}` | Optional resource limits/requests for the key generation hook. | +| secrets.defaultEncryptionKey.generated.securityContext | object | `{}` | Optional container security context for the key generation hook. | +| secrets.defaultEncryptionKey.generated.serviceAccount.annotations | object | `{}` | Annotations to add to the key generation hook service account. | +| secrets.defaultEncryptionKey.generated.serviceAccount.automount | bool | `true` | Automatically mount the ServiceAccount's API credentials. | +| secrets.defaultEncryptionKey.generated.serviceAccount.create | bool | `true` | Specifies whether a service account should be created for the key generation hook. | +| secrets.defaultEncryptionKey.generated.serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template. 
| +| secrets.defaultEncryptionKey.generated.tolerations | list | `[]` | Tolerations for the key generation hook. | +| secrets.defaultEncryptionKey.generated.ttlSecondsAfterFinished | int | `300` | Seconds to keep the key generation hook Job after it finishes, if the hook is not deleted first. | +| secrets.defaultEncryptionKey.value | string | `""` | Optional base64-encoded key for encrypting platform secrets. The decoded key must be at least 32 bytes. If empty and envFromSecret is not set, a pre-install hook generates a per-install key. | | securityContext | object | This object has the following default values for the container security context. | Container security context settings applied to all services by default. These can be overridden in individual service configurations. | | telemetry.OTEL_EXPORTER_OTLP_ENDPOINT | string | `""` | The OpenTelemetry grpc collector endpoint to export traces and metrics to. | | telemetry.OTEL_EXPORTER_OTLP_INSECURE | bool | `true` | Whether to use an insecure connection (no TLS) to the OpenTelemetry collector endpoint. | diff --git a/k8s/helm/ci/14-secretfromenv.yaml b/k8s/helm/ci/14-secretfromenv.yaml index 138af4b793..16cea74d08 100644 --- a/k8s/helm/ci/14-secretfromenv.yaml +++ b/k8s/helm/ci/14-secretfromenv.yaml @@ -1,7 +1,6 @@ -# Test envFromSecret: use an external secret for API pod env vars (no default api-env secret created). +# Test envFromSecret: use an external secret for API pod env vars (no default api-env secret created or generated). # CI validates by running: helm template nemo-platform . -f ci/14-secretfromenv.yaml # When envFromSecret is set: the api-env Secret template must not render; the API deployment # must reference this secret name in envFrom. 
envFromSecret: "my-external-api-env" - diff --git a/k8s/helm/ci/20-explicit-default-encryption-key.yaml b/k8s/helm/ci/20-explicit-default-encryption-key.yaml new file mode 100644 index 0000000000..cab0907ae7 --- /dev/null +++ b/k8s/helm/ci/20-explicit-default-encryption-key.yaml @@ -0,0 +1,8 @@ +# Test explicit secrets.defaultEncryptionKey.value: render the chart-managed API env Secret +# with a user-provided base64 key instead of the pre-install key generation hook. +# CI validates by running: helm template nemo-platform . -f ci/20-explicit-default-encryption-key.yaml + +secrets: + defaultEncryptionKey: + # Base64 for 32 zero bytes. This is only a deterministic render-test value. + value: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" diff --git a/k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl b/k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl index 38fb8da57f..d0cae630ca 100644 --- a/k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl +++ b/k8s/helm/helm-docs-template/nemo-helm-readme.md.gotmpl @@ -4,4 +4,22 @@ Documentation can be found at: https://docs.nvidia.com/nemo/microservices/. +## Platform Secrets Encryption Key + +The platform secrets service reads `NMP_SECRETS_DEFAULT_ENCRYPTION_KEY` from the +API env Secret. The value must be base64-encoded and decode to at least 32 bytes. + +Set `secrets.defaultEncryptionKey.value` to provide your own key. When that value +is empty and `envFromSecret` is not set, the chart runs a pre-install hook that +creates the API env Secret (the release fullname plus `-api-env`) with a per-install random key. The hook is +install-only and refuses to patch or rotate an existing Secret. + +Set `envFromSecret` to use a fully user-managed API env Secret. In that mode the +chart does not create or generate the API env Secret. + +On upgrade, the generated Secret must already exist and contain +`NMP_SECRETS_DEFAULT_ENCRYPTION_KEY`.
If it is missing, restore the original +Secret instead of generating a replacement key; existing encrypted platform +secrets will not decrypt with a new key. + {{ template "chart.valuesSection" . }} diff --git a/k8s/helm/templates/_helpers.tpl b/k8s/helm/templates/_helpers.tpl index 2be28e74bc..b12d221a61 100644 --- a/k8s/helm/templates/_helpers.tpl +++ b/k8s/helm/templates/_helpers.tpl @@ -148,6 +148,61 @@ Pod annotations checksum/config: {{ include (print $.Template.BasePath "/platform-configmap.yaml") . | sha256sum }} {{- end -}} +{{/* +Name of the API environment Secret. This Secret provides environment variables +loaded via envFrom, including the secrets service default encryption key when the +chart manages it. +*/}} +{{- define "nemo-platform.apiEnvSecretName" -}} +{{- .Values.envFromSecret | default (printf "%s-api-env" (include "nemo-platform.fullname" .)) -}} +{{- end -}} + +{{/* +Environment variable name used by the secrets service secret_key provider. +*/}} +{{- define "nemo-platform.defaultEncryptionKeyEnvName" -}} +NMP_SECRETS_DEFAULT_ENCRYPTION_KEY +{{- end -}} + +{{/* +Whether the chart should generate the API env Secret through a pre-install hook. +Generation is install-only. On upgrade, a missing generated key is unrecoverable +without restoring the original key or rotating/re-encrypting secrets through the +supported admin flow, so the chart must not generate a replacement. +*/}} +{{- define "nemo-platform.generateDefaultEncryptionKey" -}} +{{- if and .Release.IsInstall (not .Values.envFromSecret) (not .Values.secrets.defaultEncryptionKey.value) .Values.secrets.defaultEncryptionKey.generated.enabled -}} +true +{{- end -}} +{{- end -}} + +{{/* +Whether an upgrade should require the generated API env Secret to already exist. 
+*/}}
+{{- define "nemo-platform.requireExistingGeneratedDefaultEncryptionKey" -}}
+{{- if and .Release.IsUpgrade (not .Values.envFromSecret) (not .Values.secrets.defaultEncryptionKey.value) .Values.secrets.defaultEncryptionKey.generated.enabled -}}
+true
+{{- end -}}
+{{- end -}}
+
+{{/*
+Name shared by the key generation hook RBAC and Job resources.
+*/}}
+{{- define "nemo-platform.defaultEncryptionKeyGeneratorName" -}}
+{{- printf "%s-api-env-keygen" (include "nemo-platform.fullname" .) | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+ServiceAccount name for the key generation hook.
+*/}}
+{{- define "nemo-platform.defaultEncryptionKeyGeneratorServiceAccountName" -}}
+{{- if .Values.secrets.defaultEncryptionKey.generated.serviceAccount.create -}}
+{{- default (include "nemo-platform.defaultEncryptionKeyGeneratorName" .) .Values.secrets.defaultEncryptionKey.generated.serviceAccount.name -}}
+{{- else -}}
+{{- default "default" .Values.secrets.defaultEncryptionKey.generated.serviceAccount.name -}}
+{{- end -}}
+{{- end -}}
+
 {{/*
 Custom image pull secret if not defined
 */}}
diff --git a/k8s/helm/templates/api-env-secret-generator.yaml b/k8s/helm/templates/api-env-secret-generator.yaml
new file mode 100644
index 0000000000..7ba96c1f00
--- /dev/null
+++ b/k8s/helm/templates/api-env-secret-generator.yaml
@@ -0,0 +1,241 @@
+{{- /*
+Pre-install hook that creates the API env Secret with a random default
+encryption key for the secrets service. RBAC objects use hook weight -30 and the
+Job uses -20, so the Job's permissions exist before it starts. The script never
+patches or rotates an existing Secret.
+*/ -}}
+{{- if include "nemo-platform.generateDefaultEncryptionKey" . }}
+{{- $hookAnnotations := dict "helm.sh/hook" "pre-install" "helm.sh/hook-weight" "-20" "helm.sh/hook-delete-policy" "before-hook-creation,hook-succeeded" -}}
+{{- $rbacAnnotations := dict "helm.sh/hook" "pre-install" "helm.sh/hook-weight" "-30" "helm.sh/hook-delete-policy" "before-hook-creation,hook-succeeded" -}}
+{{- if .Values.secrets.defaultEncryptionKey.generated.serviceAccount.create }}
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "nemo-platform.defaultEncryptionKeyGeneratorServiceAccountName" . }}
+  labels:
+    app.kubernetes.io/component: api-env-keygen
+    {{- include "nemo-platform.labels" . | nindent 4 }}
+  annotations:
+    {{- toYaml (mergeOverwrite (deepCopy $rbacAnnotations) (.Values.secrets.defaultEncryptionKey.generated.serviceAccount.annotations | default dict)) | nindent 4 }}
+automountServiceAccountToken: {{ .Values.secrets.defaultEncryptionKey.generated.serviceAccount.automount }}
+{{- end }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ include "nemo-platform.defaultEncryptionKeyGeneratorName" . }}
+  labels:
+    app.kubernetes.io/component: api-env-keygen
+    {{- include "nemo-platform.labels" . | nindent 4 }}
+  annotations:
+    {{- toYaml $rbacAnnotations | nindent 4 }}
+rules:
+  # Reads are scoped to the single Secret this hook manages.
+  - apiGroups: [""]
+    resources: ["secrets"]
+    resourceNames: [{{ include "nemo-platform.apiEnvSecretName" . | quote }}]
+    verbs: ["get"]
+  # Kubernetes RBAC cannot restrict "create" by resourceNames.
+  - apiGroups: [""]
+    resources: ["secrets"]
+    verbs: ["create"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "nemo-platform.defaultEncryptionKeyGeneratorName" . }}
+  labels:
+    app.kubernetes.io/component: api-env-keygen
+    {{- include "nemo-platform.labels" . | nindent 4 }}
+  annotations:
+    {{- toYaml $rbacAnnotations | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ include "nemo-platform.defaultEncryptionKeyGeneratorName" . }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "nemo-platform.defaultEncryptionKeyGeneratorServiceAccountName" . }}
+    namespace: {{ .Release.Namespace }}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "nemo-platform.defaultEncryptionKeyGeneratorName" . }}
+  labels:
+    app.kubernetes.io/component: api-env-keygen
+    {{- include "nemo-platform.labels" . | nindent 4 }}
+  annotations:
+    {{- toYaml $hookAnnotations | nindent 4 }}
+spec:
+  ttlSecondsAfterFinished: {{ .Values.secrets.defaultEncryptionKey.generated.ttlSecondsAfterFinished }}
+  backoffLimit: {{ .Values.secrets.defaultEncryptionKey.generated.backoffLimit }}
+  activeDeadlineSeconds: {{ .Values.secrets.defaultEncryptionKey.generated.activeDeadlineSeconds }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: api-env-keygen
+        {{- include "nemo-platform.selectorLabels" . | nindent 8 }}
+    spec:
+      restartPolicy: Never
+      serviceAccountName: {{ include "nemo-platform.defaultEncryptionKeyGeneratorServiceAccountName" . }}
+      # The script reads the mounted token and CA bundle, so force the mount even
+      # when the ServiceAccount itself disables automounting.
+      automountServiceAccountToken: true
+      imagePullSecrets:
+        {{ include "nemo-common.imagepullsecrets" . | nindent 8 | trim }}
+      securityContext:
+        {{- toYaml .Values.secrets.defaultEncryptionKey.generated.podSecurityContext | nindent 8 }}
+      containers:
+        - name: api-env-keygen
+          image: "{{ .Values.secrets.defaultEncryptionKey.generated.image.repository }}:{{ .Values.secrets.defaultEncryptionKey.generated.image.tag }}"
+          imagePullPolicy: {{ .Values.secrets.defaultEncryptionKey.generated.image.pullPolicy }}
+          securityContext:
+            {{- toYaml .Values.secrets.defaultEncryptionKey.generated.securityContext | nindent 12 }}
+          command:
+            - python
+            - -c
+          args:
+            - |
+              import base64
+              import json
+              import os
+              import ssl
+              import sys
+              import urllib.error
+              import urllib.request
+
+              secret_name = os.environ["SECRET_NAME"]
+              key_name = os.environ["SECRET_KEY"]
+              namespace = os.environ["POD_NAMESPACE"]
+              api_host = os.environ["KUBERNETES_SERVICE_HOST"]
+              api_port = os.environ.get("KUBERNETES_SERVICE_PORT", "443")
+              # Bracket IPv6 literals so host and port form a valid URL authority.
+              if ":" in api_host and not api_host.startswith("["):
+                  api_host = "[{}]".format(api_host)
+              token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+              ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+
+              with open(token_path, encoding="utf-8") as token_file:
+                  token = token_file.read().strip()
+
+              context = ssl.create_default_context(cafile=ca_path)
+
+              # Send an authenticated JSON request to the in-cluster API server.
+              def api_request(path, method="GET", body=None):
+                  data = None
+                  headers = {
+                      "Authorization": "Bearer " + token,
+                      "Accept": "application/json",
+                  }
+                  if body is not None:
+                      data = json.dumps(body).encode("utf-8")
+                      headers["Content-Type"] = "application/json"
+                  request = urllib.request.Request(
+                      "https://{}:{}{}".format(api_host, api_port, path),
+                      data=data,
+                      headers=headers,
+                      method=method,
+                  )
+                  return urllib.request.urlopen(request, context=context, timeout=30)
+
+              secret_path = "/api/v1/namespaces/{}/secrets/{}".format(namespace, secret_name)
+              try:
+                  with api_request(secret_path) as response:
+                      secret = json.load(response)
+              except urllib.error.HTTPError as exc:
+                  if exc.code != 404:
+                      raise
+                  secret = None
+
+              # An existing Secret is never modified: keep it if it has the key, fail otherwise.
+              if secret is not None:
+                  if secret.get("data", {}).get(key_name):
+                      print(
+                          "Secret {}/{} already contains {}; leaving it unchanged.".format(
+                              namespace,
+                              secret_name,
+                              key_name,
+                          )
+                      )
+                      sys.exit(0)
+                  print(
+                      "Secret {}/{} exists but does not contain {}; refusing to rotate or patch KEK.".format(
+                          namespace,
+                          secret_name,
+                          key_name,
+                      ),
+                      file=sys.stderr,
+                  )
+                  sys.exit(1)
+
+              key = base64.b64encode(os.urandom(32)).decode("ascii")
+              body = {
+                  "apiVersion": "v1",
+                  "kind": "Secret",
+                  "metadata": {
+                      "name": secret_name,
+                      "namespace": namespace,
+                      "labels": {
+                          "app.kubernetes.io/managed-by": {{ .Release.Service | quote }},
+                          "app.kubernetes.io/name": {{ include "nemo-platform.name" . | quote }},
+                          "app.kubernetes.io/instance": {{ .Release.Name | quote }},
+                          "app.kubernetes.io/part-of": "nemo-platform",
+                      },
+                      "annotations": {
+                          "helm.sh/resource-policy": "keep",
+                      },
+                  },
+                  "type": "Opaque",
+                  "stringData": {
+                      key_name: key,
+                  },
+              }
+
+              try:
+                  with api_request("/api/v1/namespaces/{}/secrets".format(namespace), method="POST", body=body) as response:
+                      if response.status not in (200, 201):
+                          raise RuntimeError("unexpected create Secret response {}".format(response.status))
+              except urllib.error.HTTPError as exc:
+                  # On 409 the Secret already exists; accept it only if it holds the key.
+                  if exc.code != 409:
+                      raise
+                  with api_request(secret_path) as response:
+                      secret = json.load(response)
+                  if secret.get("data", {}).get(key_name):
+                      print(
+                          "Secret {}/{} already contains {}; leaving it unchanged.".format(
+                              namespace,
+                              secret_name,
+                              key_name,
+                          )
+                      )
+                      sys.exit(0)
+                  raise
+
+              print("Created Secret {}/{} with generated {}.".format(namespace, secret_name, key_name))
+          env:
+            - name: SECRET_NAME
+              value: {{ include "nemo-platform.apiEnvSecretName" . | quote }}
+            - name: SECRET_KEY
+              value: {{ include "nemo-platform.defaultEncryptionKeyEnvName" . | quote }}
+            - name: POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          resources:
+            {{- toYaml .Values.secrets.defaultEncryptionKey.generated.resources | nindent 12 }}
+      {{- with .Values.secrets.defaultEncryptionKey.generated.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.secrets.defaultEncryptionKey.generated.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.secrets.defaultEncryptionKey.generated.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+{{- end }}
diff --git a/k8s/helm/templates/api-env-secret-upgrade-check.yaml b/k8s/helm/templates/api-env-secret-upgrade-check.yaml
new file mode 100644
index 0000000000..152befbcbe
--- /dev/null
+++ b/k8s/helm/templates/api-env-secret-upgrade-check.yaml
@@ -0,0 +1,11 @@
+{{- if include "nemo-platform.requireExistingGeneratedDefaultEncryptionKey" . }}
+{{- $secretName := include "nemo-platform.apiEnvSecretName" . -}}
+{{- $keyName := include "nemo-platform.defaultEncryptionKeyEnvName" . -}}
+{{- $secret := lookup "v1" "Secret" .Release.Namespace $secretName -}}
+{{- if not $secret -}}
+{{- fail (printf "generated API env Secret %s/%s is missing during upgrade; restore the original Secret instead of generating a new encryption key" .Release.Namespace $secretName) -}}
+{{- end -}}
+{{- if not (hasKey ($secret.data | default dict) $keyName) -}}
+{{- fail (printf "generated API env Secret %s/%s is missing key %s during upgrade; restore the original key instead of generating a new encryption key" .Release.Namespace $secretName $keyName) -}}
+{{- end -}}
+{{- end }}
diff --git a/k8s/helm/templates/api-env-secret.yaml b/k8s/helm/templates/api-env-secret.yaml
index 96d7386d79..3cb04be081 100644
--- a/k8s/helm/templates/api-env-secret.yaml
+++ b/k8s/helm/templates/api-env-secret.yaml
@@ -1,13 +1,22 @@
 {{- if not .Values.envFromSecret }}
+{{- if .Values.secrets.defaultEncryptionKey.value }}
 ---
 apiVersion: v1
 kind: Secret
 metadata:
-  name: {{ include "nemo-platform.fullname" . }}-api-env
+  name: {{ include "nemo-platform.apiEnvSecretName" . }}
   labels:
     {{- include "nemo-platform.labels" . | nindent 4 }}
 type: Opaque
 stringData:
-  # This is the default encryption key for encrypting secrets on the platform.
-  NMP_SECRETS_DEFAULT_ENCRYPTION_KEY: "f4NPSp39YN5oWTwZ3iDX/L3PTvEH8qFvUs1noC/jWuo="
+  {{ include "nemo-platform.defaultEncryptionKeyEnvName" .
}}: {{ .Values.secrets.defaultEncryptionKey.value | quote }} +{{- else if include "nemo-platform.generateDefaultEncryptionKey" . }} +{{- /* The pre-install hook creates the API env Secret with a generated key. */ -}} +{{- else if include "nemo-platform.requireExistingGeneratedDefaultEncryptionKey" . }} +{{- /* The upgrade guard verifies the existing generated Secret. */ -}} +{{- else if and (not .Release.IsInstall) (not .Release.IsUpgrade) }} +{{- /* helm lint evaluates templates without install or upgrade mode. */ -}} +{{- else }} +{{- fail "set envFromSecret, secrets.defaultEncryptionKey.value, or enable secrets.defaultEncryptionKey.generated.enabled" }} +{{- end }} {{- end }} diff --git a/k8s/helm/templates/api/api-deployment.yaml b/k8s/helm/templates/api/api-deployment.yaml index 60a19826ad..e842574be1 100644 --- a/k8s/helm/templates/api/api-deployment.yaml +++ b/k8s/helm/templates/api/api-deployment.yaml @@ -52,7 +52,7 @@ spec: {{- end }} envFrom: - secretRef: - name: {{ .Values.envFromSecret | default (printf "%s-api-env" (include "nemo-platform.fullname" .)) }} + name: {{ include "nemo-platform.apiEnvSecretName" . }} env: - name: NMP_CONFIG_FILE_PATH value: /etc/nmp/config.yaml diff --git a/k8s/helm/templates/platform-seed-job.yaml b/k8s/helm/templates/platform-seed-job.yaml index e8505c8b46..a4c7f376df 100644 --- a/k8s/helm/templates/platform-seed-job.yaml +++ b/k8s/helm/templates/platform-seed-job.yaml @@ -46,7 +46,7 @@ spec: - nmp.platform_seed envFrom: - secretRef: - name: {{ .Values.envFromSecret | default (printf "%s-api-env" (include "nemo-platform.fullname" .)) }} + name: {{ include "nemo-platform.apiEnvSecretName" . 
}} env: - name: NMP_CONFIG_FILE_PATH value: /etc/nmp/config.yaml diff --git a/k8s/helm/templates/tests/nccl-test.yaml b/k8s/helm/templates/tests/nccl-test.yaml index 7a7e257f12..450694d324 100644 --- a/k8s/helm/templates/tests/nccl-test.yaml +++ b/k8s/helm/templates/tests/nccl-test.yaml @@ -295,7 +295,7 @@ spec: imagePullSecrets: {{ include "nemo-common.imagepullsecrets" . | nindent 8 | trim }} containers: - - name: kubectl + - name: configmap-cleanup image: "{{ .Values.ncclTest.configMapCleanupJob.image.repository }}:{{ .Values.ncclTest.configMapCleanupJob.image.tag }}" imagePullPolicy: IfNotPresent env: @@ -305,10 +305,48 @@ spec: fieldPath: metadata.namespace - name: CM_NAME value: {{ printf "%s-cm" $ncclTestBase | trunc 63 | quote }} - command: ["/bin/bash", "-lc"] + command: + - python + - -c args: - | - set -e - echo "Deleting ConfigMap ${CM_NAME} in ${NAMESPACE}" - kubectl delete configmap "${CM_NAME}" -n "${NAMESPACE}" --ignore-not-found - echo "ConfigMap cleanup done." + import os + import ssl + import urllib.error + import urllib.request + + namespace = os.environ["NAMESPACE"] + configmap_name = os.environ["CM_NAME"] + api_host = os.environ["KUBERNETES_SERVICE_HOST"] + api_port = os.environ.get("KUBERNETES_SERVICE_PORT", "443") + token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + + with open(token_path, encoding="utf-8") as token_file: + token = token_file.read().strip() + + request = urllib.request.Request( + "https://{}:{}/api/v1/namespaces/{}/configmaps/{}".format( + api_host, + api_port, + namespace, + configmap_name, + ), + headers={ + "Authorization": "Bearer " + token, + "Accept": "application/json", + }, + method="DELETE", + ) + + print("Deleting ConfigMap {} in {}".format(configmap_name, namespace)) + try: + urllib.request.urlopen( + request, + context=ssl.create_default_context(cafile=ca_path), + timeout=30, + ).close() + except urllib.error.HTTPError as exc: + 
if exc.code != 404: + raise + print("ConfigMap cleanup done.") diff --git a/k8s/helm/values.yaml b/k8s/helm/values.yaml index 3167f8b36b..bac76bf5e2 100644 --- a/k8s/helm/values.yaml +++ b/k8s/helm/values.yaml @@ -13,11 +13,52 @@ ngcAPIKey: YOUR-NGC-API-KEY env: {} # -- Optional. Name of an existing Kubernetes Secret to load as env vars (envFrom) for the API pod. -# When set, the chart does not create the default api-env secret; use your own secret (e.g. from Vault, sealed-secrets). -# When unset, the chart creates a default secret with the environment variable NMP_SECRETS_DEFAULT_ENCRYPTION_KEY for default installation. -# See the NeMo Platform documentation for more details on secrets encryption. +# When set, the chart does not create or generate the default api-env Secret; use your own Secret (for example, from Vault or sealed-secrets). envFromSecret: "" +# -- Secrets service configuration. +secrets: + defaultEncryptionKey: + # -- Optional base64-encoded key for encrypting platform secrets. The decoded key must be at least 32 bytes. If empty and envFromSecret is not set, a pre-install hook generates a per-install key. + value: "" + # -- Generated key configuration used only when value and envFromSecret are empty. The generated key is not rotated or recreated on upgrade. + generated: + enabled: true + image: + # -- Image repository for the pre-install key generation hook. + repository: docker.io/library/python + # -- Image tag for the pre-install key generation hook. + tag: "3.12-slim" + # -- Image pull policy for the pre-install key generation hook. + pullPolicy: IfNotPresent + serviceAccount: + # -- Specifies whether a service account should be created for the key generation hook. + create: true + # -- Automatically mount the ServiceAccount's API credentials. + automount: true + # -- Annotations to add to the key generation hook service account. + annotations: {} + # -- The name of the service account to use. 
If not set and create is true, a name is generated using the fullname template. + name: "" + # -- Number of retries before the key generation hook is marked failed. + backoffLimit: 3 + # -- Maximum seconds for the key generation hook to run. + activeDeadlineSeconds: 120 + # -- Seconds to keep the key generation hook Job after it finishes, if the hook is not deleted first. + ttlSecondsAfterFinished: 300 + # -- Optional pod security context for the key generation hook. + podSecurityContext: {} + # -- Optional container security context for the key generation hook. + securityContext: {} + # -- Optional resource limits/requests for the key generation hook. + resources: {} + # -- Node selector for the key generation hook. + nodeSelector: {} + # -- Affinity for the key generation hook. + affinity: {} + # -- Tolerations for the key generation hook. + tolerations: [] + # -- You can use an existing Kubernetes secret for communicating with the NGC API for downloading models. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string. existingSecret: ngc-api @@ -108,11 +149,11 @@ ncclTest: limits: cpu: "1" memory: 512Mi - # -- Post-test hook Job (after orchestrator): kubectl deletes the scripts ConfigMap (helm.sh/hook-weight 5). + # -- Post-test hook Job (after orchestrator): deletes the scripts ConfigMap (helm.sh/hook-weight 5). configMapCleanupJob: image: - repository: bitnami/kubectl - tag: "latest" + repository: docker.io/library/python + tag: "3.12-slim" worker: image: repository: nvcr.io/nvidia/nemo-platform/nmp-automodel-training