diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/README.md b/training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/README.md index 289f6b1..9b2701d 100644 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/README.md +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/README.md @@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-405b +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/README.md b/training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/README.md index 28f525f..0733b2e 100644 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/README.md +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/README.md @@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-405b/nemo-pretraining-gke/32_nodes +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/README.md b/training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/README.md index b402f3f..e133660 100644 --- a/training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/README.md +++ b/training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/README.md @@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-70b/nemo-pretraining-gke/16_nodes +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/README.md b/training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/README.md index ee39cd6..ca5d0d8 100644 --- a/training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/README.md +++ b/training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/README.md @@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-70b/nemo-pretraining-gke/32_nodes +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md b/training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md index ec95a07..139514a 100644 --- a/training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md +++ b/training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md @@ -67,11 +67,11 @@ Set the default project: Clone the `gpu-recipes` repository and set a reference to the recipe folder. -``` +``` git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-70b/nemo-pretraining-gke/64_nodes +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/README.md b/training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/README.md index ed347fb..58d1e93 100644 --- a/training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/README.md +++ b/training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/README.md @@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-8b/nemo-pretraining-gke/16_nodes +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/Chart.yaml b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/Chart.yaml new file mode 100644 index 0000000..af46c11 --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/README.md b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/README.md new file mode 100644 index 0000000..b7e3725 --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/README.md @@ -0,0 +1,151 @@ + +# Pretrain llama3-8b workloads on a4x GKE Node pools with Nvidia Megatron-Bridge Framework + +This recipe outlines the steps for running a llama3-8b pretraining +workload on [a4x GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA Megatron-Bridge framework](https://github.com/NVIDIA-NeMo/Megatron-Bridge). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [Megatron-Bridge pretraining workload](https://github.com/NVIDIA-NeMo/Megatron-Bridge). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4x) +to create your a4x GKE cluster. + +## Training dataset + +This recipe uses a mock pretraining dataset provided by the Megatron-Bridge framework. 
+ +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:25.11` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.0` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + export GCS_BUCKET= # Note: path should not be prefixed with gs:// + export KUEUE_NAME= + ``` + +Replace the following values: + + - ``: your Google Cloud project ID. + - ``: the region where your cluster is located. + - ``: the name of your GKE cluster. + - ``: the name of your Cloud Storage bucket. Don't include the `gs://` prefix. + - ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4x`. Make sure to verify the name of the local queue in your cluster. + +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. + +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 2 node (8 gpus) fp8-cs precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4x-llama3-8b-2node +helm install $WORKLOAD_NAME . 
-f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4x-llama3-8b-2node + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4x-llama3-8b-2node +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4x-llama3-8b-2node. + +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4x-llama3-8b-2node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. 
To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4x-llama3-8b-2node +``` \ No newline at end of file diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/launcher.sh b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/launcher.sh new file mode 100644 index 0000000..39d2db4 --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/launcher.sh @@ -0,0 +1,135 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. +EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" +ldconfig $LD_LIBRARY_PATH +echo "Added $LD_LIBRARY_PATH to ldconfig:" +ldconfig -p | grep libcuda | sed 's/^/ /' +echo "" + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + + +cd /opt +rm -rf Megatron-Bridge +git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +cd Megatron-Bridge +git checkout 7695d4acbfac19353d20e456509117efe4733d6b +ls + + +worker_command=$(cat <<- EOM + if [ "\$RANK" -eq "0" ]; then + echo "Worker 0 is stalling for a few seconds.." ; + sleep 3 ; + echo "The detected environment within worker rank 0 is:" ; + env | sed 's/^/ /' ; + fi ; + + cd /opt/Megatron-Bridge ; + + numactl \ + --cpunodebind=\$((LOCAL_RANK/2)) \ + --membind=\$((LOCAL_RANK/2)) nsys profile \ + -t nvtx,cuda \ + --cuda-event-trace=false \ + --sample=none \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --kill none \ + -o ${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK \ + --force-overwrite true \ + --session-new "nsys-\$RANDOM-\$RANK" \ + nice -10 \ + python scripts/performance/run_script.py \ + --gpu gb200 \ + --model_family_name llama \ + --model_recipe_name llama3_8b \ + --num_gpus 8 \ + --gpus_per_node 4 \ + --compute_dtype fp8_cs \ + --global_batch_size 128 \ + --micro_batch_size 4 \ + --seq_length 8192 \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 1 \ + --context_parallel_size 1 \ + --virtual_pipeline_model_parallel_size None \ + --expert_model_parallel_size 1 \ + --max_steps 50 + +EOM +) + +echo "$worker_command" > worker_command.sh +chmod 777 worker_command.sh + +torchrun \ +--nproc-per-node="4" \ +--nnodes="2" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +--no-python bash worker_command.sh + + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git 
a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/recipe_launch_command.sh b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/recipe_launch_command.sh new file mode 100644 index 0000000..f78e94b --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install $USER-a4x-llama3-8b-2node . -f values.yaml --set-file workload_launcher=launcher.sh --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/$USER-a4x-llama3-8b-2node --set queue=${KUEUE_NAME} \ No newline at end of file diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-config-configmap.yaml b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-config-configmap.yaml new file mode 100644 index 0000000..a1d54ce --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-job.yaml b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-job.yaml new file mode 100644 index 0000000..e2b6d54 --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-job.yaml @@ -0,0 +1,352 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 4 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 4}} +{{- $root := . 
-}} +apiVersion: resource.nvidia.com/v1beta1 +kind: ComputeDomain +metadata: + name: "{{ .Release.Name }}-{{ $jobSuffix }}" +spec: + numNodes: {{ $nodes }} + channel: + resourceClaimTemplate: + name: "{{ .Release.Name }}-{{ $jobSuffix }}" +--- +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i 
}}","network":"{{ $subnetwork }}"}{{ eq $i 5 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 4 }} + {"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 3 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + - key: "kubernetes.io/arch" + operator: "Equal" + value: "arm64" + effect: "NoSchedule" + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := 
$root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ $gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: "{{ .Release.Name }}-{{ $jobSuffix }}" + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index 
provided by the Indexed Job + # This is crucial for torch.distributed initialization. + - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a3u.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . 
}} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . }} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} + claims: + - name: compute-domain-channel diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-launcher-configmap.yaml b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-launcher-configmap.yaml new file mode 100644 index 0000000..7026e0f --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-svc.yaml b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-svc.yaml new file mode 100644 index 0000000..7cfe220 --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/values.yaml b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/values.yaml new file mode 100644 index 0000000..2727823 --- /dev/null +++ b/training/a4x/llama3-8b/megatron-bridge-pretraining-gke/2node-FP8CS-GBS128/recipe/values.yaml @@ -0,0 +1,31 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: null + configPath: null + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + gpus: 8 + image: nvcr.io/nvidia/nemo:25.11 diff --git a/training/a4x/nemotron4-340B/nemo-pretraining-gke/16node-FP8CS-GBS256/recipe/README.md b/training/a4x/nemotron4-340B/nemo-pretraining-gke/16node-FP8CS-GBS256/recipe/README.md index 309fae5..0c47c87 100644 --- a/training/a4x/nemotron4-340B/nemo-pretraining-gke/16node-FP8CS-GBS256/recipe/README.md +++ b/training/a4x/nemotron4-340B/nemo-pretraining-gke/16node-FP8CS-GBS256/recipe/README.md @@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/nemotron4-340b/nemo-pretraining-gke/16_nodes +export RECIPE_ROOT=$REPO_ROOT/training/a4x/nemotron4-340B/nemo-pretraining-gke/16node-FP8CS-GBS256/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/nemotron4-340B/nemo-pretraining-gke/32node-FP8CS-GBS256/recipe/README.md b/training/a4x/nemotron4-340B/nemo-pretraining-gke/32node-FP8CS-GBS256/recipe/README.md index 9b31d85..69d5ab0 100644 --- a/training/a4x/nemotron4-340B/nemo-pretraining-gke/32node-FP8CS-GBS256/recipe/README.md +++ b/training/a4x/nemotron4-340B/nemo-pretraining-gke/32node-FP8CS-GBS256/recipe/README.md @@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/nemotron4-340b/nemo-pretraining-gke/32_nodes +export RECIPE_ROOT=$REPO_ROOT/training/a4x/nemotron4-340B/nemo-pretraining-gke/32node-FP8CS-GBS256/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/Chart.yaml b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/Chart.yaml new file mode 100644 index 0000000..af46c11 --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/README.md b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/README.md new file mode 100644 index 0000000..c296fdd --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/README.md @@ -0,0 +1,151 @@ + +# Pretrain qwen3-30b workloads on a4x GKE Node pools with Nvidia Megatron-Bridge Framework + +This recipe outlines the steps for running a qwen3-30b pretraining +workload on [a4x GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA Megatron-Bridge framework](https://github.com/NVIDIA-NeMo/Megatron-Bridge). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [Megatron-Bridge pretraining workload](https://github.com/NVIDIA-NeMo/Megatron-Bridge). 
+
+## Test environment
+
+This recipe has been optimized for and tested with the following configuration:
+
+- GKE cluster
+Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4x)
+to create your a4x GKE cluster.
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the Megatron-Bridge framework.
+
+## Docker container image
+
+This recipe uses the following docker images:
+
+- `nvcr.io/nvidia/nemo:25.11`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.0`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+  ```bash
+  export PROJECT_ID=<PROJECT_ID>
+  export CLUSTER_REGION=<CLUSTER_REGION>
+  export CLUSTER_NAME=<CLUSTER_NAME>
+  export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+  export KUEUE_NAME=<KUEUE_NAME>
+  ```
+
+Replace the following values:
+
+  - `<PROJECT_ID>`: your Google Cloud project ID.
+  - `<CLUSTER_REGION>`: the region where your cluster is located.
+  - `<CLUSTER_NAME>`: the name of your GKE cluster.
+  - `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+  - `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4x`. Make sure to verify the name of the local queue in your cluster.
+
+Set the default project:
+
+  ```bash
+  gcloud config set project $PROJECT_ID
+  ```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 2 node (8 gpus) fp8-mx precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4x-qwen3-30b-2node +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4x-qwen3-30b-2node + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4x-qwen3-30b-2node +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4x-qwen3-30b-2node. 
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4x-qwen3-30b-2node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4x-qwen3-30b-2node +``` \ No newline at end of file diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/launcher.sh b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/launcher.sh new file mode 100644 index 0000000..ce1a301 --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/launcher.sh @@ -0,0 +1,135 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" +ldconfig $LD_LIBRARY_PATH +echo "Added $LD_LIBRARY_PATH to ldconfig:" +ldconfig -p | grep libcuda | sed 's/^/ /' +echo "" + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +# Create the nsys directory. +mkdir -p ${explicit_log_dir}/nsys + + + +cd /opt +rm -rf Megatron-Bridge +git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +cd Megatron-Bridge +git checkout 7695d4acbfac19353d20e456509117efe4733d6b +ls + + + +worker_command=$(cat <<- EOM + if [ "\$RANK" -eq "0" ]; then + echo "Worker 0 is stalling for a few seconds.." 
; + sleep 3 ; + echo "The detected environment within worker rank 0 is:" ; + env | sed 's/^/ /' ; + fi ; + + cd /opt/Megatron-Bridge ; + + numactl \ + --cpunodebind=\$((LOCAL_RANK/2)) \ + --membind=\$((LOCAL_RANK/2)) nsys profile \ + -t nvtx,cuda \ + --cuda-event-trace=false \ + --sample=none \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --kill none \ + -o /${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK \ + --force-overwrite true \ + --session-new "nsys-\$RANDOM-\$RANK" \ + nice -10 \ + python scripts/performance/run_script.py \ + --gpu gb200 \ + --model_family_name qwen \ + --model_recipe_name qwen3_30b_a3b \ + --num_gpus 8 \ + --gpus_per_node 4 \ + --compute_dtype fp8_mx \ + --global_batch_size 1024 \ + --micro_batch_size 4 \ + --seq_length 4096 \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 1 \ + --context_parallel_size 1 \ + --virtual_pipeline_model_parallel_size None \ + --expert_model_parallel_size 8 \ + --max_steps 50 + +EOM +) + +echo "$worker_command" > worker_command.sh +chmod 777 worker_command.sh + +torchrun \ +--nproc-per-node="4" \ +--nnodes="2" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +--no-python bash worker_command.sh + + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/recipe_launch_command.sh b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/recipe_launch_command.sh new file mode 100644 index 0000000..d4ff835 --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/recipe_launch_command.sh @@ -0,0 +1 
@@
+helm install $USER-a4x-qwen3-30b-2node . -f values.yaml --set-file workload_launcher=launcher.sh --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/$USER-a4x-qwen3-30b-2node --set queue=${KUEUE_NAME}
\ No newline at end of file
diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-config-configmap.yaml b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-config-configmap.yaml
new file mode 100644
index 0000000..a1d54ce
--- /dev/null
+++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-config-configmap.yaml
@@ -0,0 +1,28 @@
+# yamllint disable
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-job.yaml b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-job.yaml new file mode 100644 index 0000000..e2b6d54 --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-job.yaml @@ -0,0 +1,352 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 4 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 4}} +{{- $root := . 
-}} +apiVersion: resource.nvidia.com/v1beta1 +kind: ComputeDomain +metadata: + name: "{{ .Release.Name }}-{{ $jobSuffix }}" +spec: + numNodes: {{ $nodes }} + channel: + resourceClaimTemplate: + name: "{{ .Release.Name }}-{{ $jobSuffix }}" +--- +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i 
}}","network":"{{ $subnetwork }}"}{{ eq $i 5 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 4 }} + {"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 3 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + - key: "kubernetes.io/arch" + operator: "Equal" + value: "arm64" + effect: "NoSchedule" + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := 
$root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ $gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: "{{ .Release.Name }}-{{ $jobSuffix }}" + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index 
provided by the Indexed Job + # This is crucial for torch.distributed initialization. + - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a3u.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . 
}} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . }} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} + claims: + - name: compute-domain-channel diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-launcher-configmap.yaml b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-launcher-configmap.yaml new file mode 100644 index 0000000..7026e0f --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-svc.yaml b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-svc.yaml new file mode 100644 index 0000000..7cfe220 --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/values.yaml b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/values.yaml new file mode 100644 index 0000000..2727823 --- /dev/null +++ b/training/a4x/qwen3-30b/megatron-bridge-pretraining-gke/2node-FP8MX-GBS1024/recipe/values.yaml @@ -0,0 +1,31 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: null + configPath: null + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + gpus: 8 + image: nvcr.io/nvidia/nemo:25.11