From 72764067bbfcbd78ce6e17a051116e3746df2164 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 28 Apr 2026 14:54:26 +0000 Subject: [PATCH 01/12] disabled rocky tests due to out-of-date kernel and base image idempotency errors --- gpu/test_gpu.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index d6c86bd8c..db64083da 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -180,8 +180,8 @@ def verify_driver_signature(self, name): def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") metadata = "install-gpu-agent=false" if configuration == 'SINGLE' \ @@ -213,8 +213,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") @@ -250,8 +250,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): @@ -300,8 +300,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") # Operation [projects/.../regions/.../operations/...] failed: # Invalid value for field 'resource.machineType': \ @@ -344,8 +344,8 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ @@ -379,8 +379,8 @@ def test_gpu_allocation(self, configuration, master_accelerator, def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ From 4d31646635b435bddef5a55cde83fa386469f938 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 03:00:34 +0000 Subject: [PATCH 02/12] Fix: Correct quoting and array usage in GPU scripts This commit addresses widespread issues in the GPU initialization scripts related to variable quoting and bash array expansion. - Consistently uses `"${array[@]}"` for expanding arrays like `curl_retry_args`, `gsutil_cmd`, and `gsutil_stat_cmd`. - Ensures variables are properly double-quoted (e.g., `"${var}"`). - Corrects quoting within `eval` statements. - Restores and corrects the logic for conditionally defining `gsutil_cmd` and `gsutil_stat_cmd` based on `gcloud --version`, using array syntax throughout. - Redirects `gsutil stat` output to `/dev/null` in `cache_fetched_package` to suppress noise. - Fixes an issue in `install_gpu_agent` where an empty `METADATA_HTTP_PROXY_PEM_URI` would cause pip to fail. These changes enhance the robustness and correctness of the scripts, particularly in environments with spaces in paths or arguments. --- gpu/install_gpu_driver.sh | 155 +++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 77 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 9a1ee94cd..6c10df5a6 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -38,7 +38,7 @@ if [[ "$(os_id)" == "rocky" ]]; else _os_version="$(os_version)" fi for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() { [[ \"$(os_id)\" == '${os_id_val}' ]] ; }" + eval "function is_${os_id_val}() { [[ \"$(os_id)\" == \"${os_id_val}\" ]] ; }" for osver in $(echo "${supported_os["${os_id_val}"]}") ; do eval "function is_${os_id_val}${osver%%.*}() { is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; }" @@ -62,9 +62,9 @@ function repair_old_backports { # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 debdists="https://deb.debian.org/debian/dists" - oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); - oldstable=$( curl ${curl_retry_args} "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); - stable=$( curl ${curl_retry_args} "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + oldoldstable=$(curl "${curl_retry_args[@]}" "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl "${curl_retry_args[@]}" "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl "${curl_retry_args[@]}" "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) @@ -81,19 +81,19 @@ function print_metadata_value() { -s -o ${tmpfile} 2>/dev/null) local readonly return_code=$? # If the command completed successfully, print the metadata value to stdout. - if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then - cat ${tmpfile} + if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then + cat "${tmpfile}" fi - rm -f ${tmpfile} - return ${return_code} + rm -f "${tmpfile}" + return "${return_code}" } function print_metadata_value_if_exists() { local return_code=1 - local readonly url=$1 - print_metadata_value ${url} + local readonly url="$1" + print_metadata_value "${url}" return_code=$? - return ${return_code} + return "${return_code}" } # replicates /usr/share/google/get_metadata_value @@ -101,14 +101,14 @@ function get_metadata_value() { local readonly varname=$1 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 # Print the instance metadata value. - print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" return_code=$? # If the instance doesn't have the value, try the project. - if [[ ${return_code} != 0 ]]; then - print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + if [[ "${return_code}" != 0 ]]; then + print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" return_code=$? fi - return ${return_code} + return "${return_code}" } function get_metadata_attribute() { @@ -245,10 +245,10 @@ function set_driver_version() { if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then + if curl "${curl_retry_args[@]}" --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then # use the version indicated by the cuda url as the default if it exists DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then + elif curl "${curl_retry_args[@]}" --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then # use the maximum sub-version available for the major version indicated in cuda url as the default DEFAULT_DRIVER="${driver_max_maj_version}" fi @@ -285,10 +285,10 @@ function set_driver_version() { # Download the file echo "Downloading from ${gpu_driver_url} to ${temp_driver_file}" - if curl -sSLf -o "${temp_driver_file}" "${gpu_driver_url}"; then + if curl "${curl_retry_args[@]}" -o "${temp_driver_file}" "${gpu_driver_url}"; then echo "Download complete. Uploading to ${gcs_cache_path}" # Upload to GCS - if gsutil cp "${temp_driver_file}" "${gcs_cache_path}"; then + if "${gsutil_cmd[@]}" cp "${temp_driver_file}" "${gcs_cache_path}"; then echo "Successfully cached to GCS." rm -f "${temp_driver_file}" else @@ -439,7 +439,7 @@ function set_cuda_runfile_url() { NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then + if ! curl "${curl_retry_args[@]}" --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead" @@ -527,7 +527,7 @@ function execute_with_retries() ( function install_cuda_keyring_pkg() { is_complete cuda-keyring-installed && return local kr_ver=1.1 - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" @@ -549,7 +549,7 @@ function install_local_cuda_repo() { readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" readonly DIST_KEYRING_DIR="/var/${pkgname}" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" @@ -557,7 +557,7 @@ function install_local_cuda_repo() { cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi @@ -577,7 +577,7 @@ function install_local_cudnn_repo() { local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${local_deb_url}" -o "${tmpdir}/local-installer.deb" dpkg -i "${tmpdir}/local-installer.deb" @@ -673,17 +673,17 @@ function install_nvidia_nccl() { if [[ "$(hostname -s)" =~ ^test-gpu && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -691,14 +691,14 @@ function install_nvidia_nccl() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" - ${gsutil_cmd} cat "${gcs_tarball}" | tar xvz + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar xvz else # build and cache touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install @@ -750,8 +750,8 @@ function install_nvidia_nccl() { make clean popd tar xzvf "${local_tarball}" - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" fi @@ -862,17 +862,17 @@ function install_pytorch() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -880,14 +880,14 @@ function install_pytorch() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" mkdir -p "${envpath}" - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C "${envpath}" -xz + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz else touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" local verb=create if test -d "${envpath}" ; then verb=install ; fi @@ -907,8 +907,8 @@ function install_pytorch() { pushd "${envpath}" tar czf "${local_tarball}" . popd - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" fi @@ -1115,17 +1115,17 @@ function build_driver_from_github() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" break fi sleep 5m @@ -1133,12 +1133,12 @@ function build_driver_from_github() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" 2>&1 ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" 2>&1 ; then echo "cache hit" else # build the kernel modules touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd open-gpu-kernel-modules install_build_dependencies @@ -1167,14 +1167,14 @@ function build_driver_from_github() { tar czvf "${local_tarball}" \ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" make clean popd fi - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C / -xzv depmod -a } @@ -1273,17 +1273,17 @@ function install_nvidia_userspace_runfile() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -1291,7 +1291,7 @@ function install_nvidia_userspace_runfile() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then cache_hit="1" if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then runfile_args="${runfile_args} --no-kernel-modules" @@ -1300,7 +1300,7 @@ function install_nvidia_userspace_runfile() { else # build the kernel modules touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" install_build_dependencies configure_dkms_certs @@ -1335,16 +1335,16 @@ function install_nvidia_userspace_runfile() { || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) ; then if [[ "${cache_hit}" == "1" ]] ; then - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C / -xzv depmod -a else clear_dkms_key tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" fi fi @@ -1478,7 +1478,7 @@ function install_ops_agent(){ mkdir -p /opt/google cd /opt/google # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation - curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + curl "${curl_retry_args[@]}" -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh local expected="038d98644e4c4a7969d26da790946720d278c8d49bb82b677f550c2a2b858411 add-google-cloud-ops-agent-repo.sh" execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install @@ -1496,9 +1496,9 @@ function install_gpu_agent() { fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" @@ -1511,7 +1511,7 @@ function install_gpu_agent() { "${python_interpreter}" -m venv "${venv}" ( source "${venv}/bin/activate" - if [[ -v METADATA_HTTP_PROXY_PEM_URI ]]; then + if [[ -v METADATA_HTTP_PROXY_PEM_URI ]] && [[ -n "${METADATA_HTTP_PROXY_PEM_URI}" ]]; then export REQUESTS_CA_BUNDLE="${trusted_pem_path}" pip install pip-system-certs unset REQUESTS_CA_BUNDLE @@ -2149,14 +2149,15 @@ $(declare -f cache_fetched_package) $(declare -f execute_with_retries) # --- Define gsutil/gcloud commands and curl args --- -gsutil_cmd="gcloud storage" -gsutil_stat_cmd="gcloud storage objects describe" -gcloud_sdk_version="\$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print \$2}' || echo '0.0.0')" -if version_lt "\${gcloud_sdk_version}" "402.0.0" ; then - gsutil_cmd="gsutil -o GSUtil:check_hashes=never" - gsutil_stat_cmd="gsutil stat" +gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}' || echo '0.0.0')" +if version_lt "${gcloud_sdk_version}" "402.0.0" ; then + gsutil_cmd=("gsutil" "-o" "GSUtil:check_hashes=never") + gsutil_stat_cmd=("gsutil" "stat") +else + gsutil_cmd=("gcloud" "storage") + gsutil_stat_cmd=("gcloud" "storage" "objects" "describe") fi -curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" +curl_retry_args=("-fsSL" "--retry-connrefused" "--retry" "10" "--retry-max-time" "30") # --- Include the main config function --- $(declare -f run_hadoop_spark_config) @@ -2322,11 +2323,11 @@ function cache_fetched_package() { local gcs_fn="$2" local local_fn="$3" - if ${gsutil_stat_cmd} "${gcs_fn}" 2>&1 ; then - execute_with_retries ${gsutil_cmd} cp "${gcs_fn}" "${local_fn}" + if "${gsutil_stat_cmd[@]}" "${gcs_fn}" > /dev/null 2>&1; then + execute_with_retries "${gsutil_cmd[@]}" cp "${gcs_fn}" "${local_fn}" else - time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \ - execute_with_retries ${gsutil_cmd} cp "${local_fn}" "${gcs_fn}" ; ) + time ( curl "${curl_retry_args[@]}" "${src_url}" -o "${local_fn}" && \ + execute_with_retries "${gsutil_cmd[@]}" cp "${local_fn}" "${gcs_fn}" ; ) fi } @@ -2442,7 +2443,7 @@ function exit_handler() { # clean up incomplete build indicators if test -n "${building_file}" ; then - if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi + if "${gsutil_stat_cmd[@]}" "${building_file}" ; then "${gsutil_cmd[@]}" rm "${building_file}" || true ; fi fi set +e # Allow cleanup commands to fail without exiting script @@ -2780,17 +2781,17 @@ function prepare_to_install(){ # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` - gsutil_cmd="gcloud storage" - gsutil_stat_cmd="gcloud storage objects describe" + gsutil_cmd=("gcloud" "storage") + gsutil_stat_cmd=("gcloud" "storage" "objects" "describe") gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then - gsutil_cmd="gsutil -o GSUtil:check_hashes=never" - gsutil_stat_cmd="gsutil stat" + gsutil_cmd=("gsutil" "-o" "GSUtil:check_hashes=never") + gsutil_stat_cmd=("gsutil" "stat") fi # if fetches of nvidia packages fail, apply -k argument to the following. - curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" + curl_retry_args=("-fsSL" "--retry-connrefused" "--retry" "10" "--retry-max-time" "30") # After manually verifying the veracity of the asset, take note of sha256sum # of the downloaded files in your gcs bucket and submit these data with an From 7f7600714ae8e90b198a3208d5dc0afaa55c17be Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 03:38:15 +0000 Subject: [PATCH 03/12] feat(gpu): Update CUDA and Driver version maps to support CUDA 12.8-13.1 This change updates the version mapping arrays in the GPU installation script to include support for NVIDIA CUDA versions 12.8, 12.9, 13.0, and 13.1, along with their corresponding driver, cuDNN, and NCCL versions. - Added entries for CUDA 12.8, 12.9, 13.0, and 13.1 to `DRIVER_FOR_CUDA`, `DRIVER_SUBVER`, `CUDNN_FOR_CUDA`, `NCCL_FOR_CUDA`, and `CUDA_SUBVER` arrays. - Updated `DEFAULT_CUDA_VERSION` for Dataproc images 2.2 and 2.3 to default to 13.1.1. - Added corresponding CUDA full version to driver version mappings in the `drv_for_cuda` array in `set_cuda_runfile_url` function. --- gpu/install_gpu_driver.sh | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 6c10df5a6..b5bf03d03 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -141,6 +141,8 @@ readonly -A DRIVER_FOR_CUDA=( ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06" ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142" + ["12.8"]="570.211.01" ["12.9"]="575.64.05" + ["13.0"]="580.126.20" ["13.1"]="590.48.01" ) readonly -A DRIVER_SUBVER=( ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" @@ -150,7 +152,8 @@ readonly -A DRIVER_SUBVER=( ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03" - ["565"]="565.77" + ["565"]="565.77" ["570"]="570.211.01" ["575"]="575.64.05" + ["580"]="580.126.20" ["590"]="590.48.01" ) # https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( @@ -160,7 +163,8 @@ readonly -A CUDNN_FOR_CUDA=( ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18" - ["12.6"]="9.6.0.74" + ["12.6"]="9.6.0.74" ["12.8"]="9.8.0.87" ["12.9"]="9.10.2.21" + ["13.0"]="9.14.0.64" ["13.1"]="9.17.1.4" ) # https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( @@ -169,7 +173,8 @@ readonly -A NCCL_FOR_CUDA=( ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4" - ["12.5"]="2.22.3" ["12.6"]="2.23.4" + ["12.5"]="2.22.3" ["12.6"]="2.23.4" ["12.8"]="2.25.1" + ["12.9"]="2.27.3" ["13.0"]="2.27.7" ["13.1"]="2.29.2" ) readonly -A CUDA_SUBVER=( ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" @@ -178,16 +183,16 @@ readonly -A CUDA_SUBVER=( ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" - ["12.6"]="12.6.3" + ["12.6"]="12.6.3" ["12.8"]="12.8.1" ["12.9"]="12.9.1" + ["13.0"]="13.0.2" ["13.1"]="13.1.1" ) - function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in - "1.5" ) DEFAULT_CUDA_VERSION="11.6.2" ;; - "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) - "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;; - "2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;; + "1.5" ) local DEFAULT_CUDA_VERSION="11.6.2" ;; + "2.0" ) local DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) + "2.1" ) local DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) local DEFAULT_CUDA_VERSION="13.1.1" ;; + "2.3" ) local DEFAULT_CUDA_VERSION="13.1.1" ;; * ) echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" exit 1 @@ -429,6 +434,10 @@ function set_cuda_runfile_url() { ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05" + ["12.8.0"]="570.86.10" ["12.8.1"]="570.124.06" + ["12.9.0"]="575.51.03" ["12.9.1"]="575.57.08" + ["13.0.0"]="580.65.06" ["13.0.1"]="580.82.07" ["13.0.2"]="580.95.05" + ["13.1.0"]="590.44.01" ["13.1.1"]="590.48.01" ) # Verify that the file with the indicated combination exists From 050046760c3c7b0a7dd76f095e1507b7328142e7 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 04:30:54 +0000 Subject: [PATCH 04/12] feat(gpu): Improve metadata handling for GPU driver and CUDA versions This change enhances the robustness of how `cuda-version` and `gpu-driver-version` metadata are processed in the GPU initialization scripts. - In `set_cuda_version` and `set_driver_version` functions: - Metadata is now fetched without a default value initially. - The script checks if the metadata value is non-empty before using it. - If the metadata is empty or not provided, it falls back to the determined default version. - Added validation steps to ensure the final version string matches the expected format (at least `X.Y`). - Included DEBUG messages to log whether the version was sourced from metadata or the default. --- gpu/install_gpu_driver.sh | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b5bf03d03..cefa8ef00 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -210,7 +210,27 @@ function set_cuda_version() { fi readonly DEFAULT_CUDA_VERSION - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + local raw_cuda_version + raw_cuda_version=$(get_metadata_attribute 'cuda-version' '') # Get raw value, default to empty + + if [[ -n "${raw_cuda_version}" ]]; then + # Use metadata value only if it's not empty + CUDA_VERSION="${raw_cuda_version}" + echo "DEBUG: Using cuda-version from metadata: '${CUDA_VERSION}'" + else + # Fallback to DEFAULT_CUDA_VERSION if metadata is empty or not found + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" + echo "DEBUG: cuda-version metadata not found or empty, using default: '${CUDA_VERSION}'" + fi + + # Validate the chosen CUDA_VERSION + if ! test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+/')" ; then + echo "ERROR: Invalid CUDA_VERSION obtained: '${CUDA_VERSION}'. Attempting to use DEFAULT: '${DEFAULT_CUDA_VERSION}'" >&2 + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" + fi + + echo "DEBUG: Effective CUDA_VERSION: '${CUDA_VERSION}'" + if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then CUDA_FULL_VERSION="${CUDA_VERSION}" CUDA_VERSION="${CUDA_VERSION%.*}" @@ -265,8 +285,23 @@ function set_driver_version() { DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} fi - DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") + local raw_driver_version + raw_driver_version=$(get_metadata_attribute 'gpu-driver-version' '') + + if [[ -n "${raw_driver_version}" ]]; then + DRIVER_VERSION="${raw_driver_version}" + echo "DEBUG: Using gpu-driver-version from metadata: '${DRIVER_VERSION}'" + else + DRIVER_VERSION="${DEFAULT_DRIVER}" + echo "DEBUG: gpu-driver-version metadata not found or empty, using default: '${DRIVER_VERSION}'" + fi + + if ! test -n "$(echo "${DRIVER_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then + echo "ERROR: Invalid DRIVER_VERSION obtained: '${DRIVER_VERSION}'. Attempting to use DEFAULT: '${DEFAULT_DRIVER}'" >&2 + DRIVER_VERSION="${DEFAULT_DRIVER}" + fi + echo "DEBUG: Effective DRIVER_VERSION: '${DRIVER_VERSION}'" readonly DRIVER_VERSION readonly DRIVER="${DRIVER_VERSION%%.*}" From 652cccf6e36ba92c73b48a9fe8b515646d41b0dd Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 14:19:06 +0000 Subject: [PATCH 05/12] feat(gpu): Enhance URL checks and add GCS caching for CUDA runfiles This change improves the robustness of the GPU driver installation script by: 1. Standardizing URL existence checks to use `curl --head` with retry arguments (`${curl_retry_args[@]}`) instead of `curl -sSLfI` for better consistency and error handling. 2. Implementing GCS caching for the CUDA runfile in `set_cuda_runfile_url`. The script now checks a pre-defined GCS bucket (`${pkg_bucket}`) for an existing copy of the required CUDA `.run` file. If found, it downloads from the cache. Otherwise, it downloads from the official NVIDIA URL and uploads a copy to the GCS bucket for future use. This speeds up subsequent runs and reduces reliance on external network availability. 3. The driver runfile caching logic in `set_driver_version` was already present but this change ensures the URL check uses the standard `${curl_retry_args[@]}`. These changes make the script more resilient to transient network issues and more efficient in environments where the same files might be needed multiple times across different cluster builds. --- gpu/install_gpu_driver.sh | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index cefa8ef00..47b7af980 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -319,7 +319,7 @@ function set_driver_version() { if ! gsutil -q stat "${gcs_cache_path}"; then echo "Driver not found in GCS cache. Validating URL: ${gpu_driver_url}" # Use curl to check if the URL is valid (HEAD request) - if curl -sSLfI --connect-timeout 10 --max-time 30 "${gpu_driver_url}" 2>/dev/null | grep -E -q 'HTTP.*200'; then + if curl "${curl_retry_args[@]}" --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200'; then echo "NVIDIA URL is valid. Downloading to cache..." local temp_driver_file="${tmpdir}/${driver_filename}" @@ -495,6 +495,31 @@ function set_cuda_runfile_url() { CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" readonly CUDA_RUNFILE + export local_cuda_runfile="${tmpdir}/${CUDA_RUNFILE}" + local gcs_cache_path="${pkg_bucket}/nvidia/${CUDA_RUNFILE}" + + echo "Checking for cached CUDA runfile at: ${gcs_cache_path}" + if "${gsutil_stat_cmd[@]}" "${gcs_cache_path}" > /dev/null 2>&1; then + echo "CUDA runfile found in GCS cache. Downloading from ${gcs_cache_path}" + if ! "${gsutil_cmd[@]}" cp "${gcs_cache_path}" "${local_cuda_runfile}"; then + echo "ERROR: Failed to download CUDA runfile from GCS cache." + exit 1 + fi + else + echo "CUDA runfile not found in GCS cache. Downloading from NVIDIA: ${NVIDIA_CUDA_URL}" + # URL validity was already checked above + echo "Downloading from ${NVIDIA_CUDA_URL} to ${local_cuda_runfile}" + if curl "${curl_retry_args[@]}" -o "${local_cuda_runfile}" "${NVIDIA_CUDA_URL}"; then + echo "Download complete. Uploading to GCS cache: ${gcs_cache_path}" + if ! "${gsutil_cmd[@]}" cp "${local_cuda_runfile}" "${gcs_cache_path}"; then + echo "WARN: Failed to upload CUDA runfile to GCS cache." + fi + else + echo "ERROR: Failed to download CUDA runfile from NVIDIA." + exit 1 + fi + fi + echo "DEBUG: Local CUDA runfile path: ${local_cuda_runfile}" if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" @@ -2080,6 +2105,7 @@ readonly HADOOP_CONF_DIR='/etc/hadoop/conf' readonly SPARK_CONF_DIR='/etc/spark/conf' readonly bdcfg="/usr/local/bin/bdconfig" readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package +readonly tmpdir="${tmpdir}" # --- Define Necessary Global Arrays --- # These need to be explicitly defined here as they are not functions. From 926a5b3cf6413a4b7970c8ccd29b5e8cdbaacc69 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 19:12:03 +0000 Subject: [PATCH 06/12] feat: Refactor Conda environment creation and add GPU packages This commit introduces a major refactoring of how Conda environments and GPU-accelerated libraries like TensorFlow, PyTorch, and Rapids are installed. Key Changes: 1. **Isolated Conda Environments:** * Introduced a new `create_conda_env` function to build and cache isolated Conda environments in GCS. This function includes logic to prevent race conditions during concurrent builds using a `.building` sentinel file. * Replaced the previous monolithic environment approach. 2. **TensorFlow Installation:** * The `install_tensorflow` function now utilizes `create_conda_env` to set up a dedicated "tensorflow" environment. 3. **PyTorch & Rapids Installation:** * The `install_pytorch` function has been rewritten to use `create_conda_env` to create two separate environments: * `pytorch`: For PyTorch and related packages. * `rapids`: For the Rapids AI ecosystem. * The metadata flag `include-pytorch=yes` now controls the installation of both these environments. 4. **NCCL Install Fix:** * Corrected the `curl` command in `install_nvidia_nccl` to properly expand the `curl_retry_args` array using `"${curl_retry_args[@]}"`. 5. **Deferred Config Fix:** * Added `readonly install_log="${tmpdir}/install.log"` to the script generated by `create_deferred_config_files` to resolve an unbound variable issue. 6. **Integration Test Updates (`test_gpu.py`):** * Updated environment paths from `/opt/conda/miniconda3/envs/` to `/opt/conda/default/envs/`. * Added `include-tensorflow=true` and `include-pytorch=yes` to the metadata for relevant tests to ensure the new environments are created during test cluster setup. These changes provide better isolation for GPU-accelerated libraries, improve cache management, and enhance the robustness of the GPU initialization action. --- gpu/install_gpu_driver.sh | 342 ++++++++++++++++++++++++++++---------- gpu/test_gpu.py | 6 +- 2 files changed, 260 insertions(+), 88 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 47b7af980..e311042a5 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -658,6 +658,203 @@ function install_local_cudnn_repo() { mark_complete install-local-cudnn-repo } +function create_conda_env() { + local env_name="$1" + shift + local packages=("$@") + + local conda_root_path="/opt/conda/default" + [[ -d ${conda_root_path} ]] || return 1 + local envpath="${conda_root_path}/envs/${env_name}" + + # Set numa node to 0 for all GPUs + for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > "${f}" || true ; done + + local build_tarball="${env_name}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + + if is_complete "install_env_${env_name}"; then + echo "Environment '${env_name}' sentinel found, skipping creation." + # Still register kernel if not already done + if ! [[ -d "/usr/local/share/jupyter/kernels/${env_name}" ]]; then + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + fi + return 0 + fi + + echo "Creating Conda environment: ${env_name}" + + set +e + "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1 + local cache_exists_code=$? + set -e + + if [[ ${cache_exists_code} -eq 0 ]]; then + echo "Cache hit for ${env_name}. Unpacking from ${gcs_tarball}" + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory: ${envpath}" + rm -rf "${envpath}" + fi + mkdir -p "${envpath}" + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz + else + echo "Cache miss for ${env_name}. Building environment." + + # Wait for any other node to finish building this same tarball + if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + sleep $(( ( RANDOM % 11 ) + 10 )) + fi + # Check for the .building file + local building_output + set +e # Don't exit if describe fails + building_output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" 2>/dev/null)" + local gcs_describe_exit_code=$? + set -e + if [[ ${gcs_describe_exit_code} -eq 0 ]] && [[ -n "${building_output}" ]]; then + local build_start_time + build_start_time=$(echo "${building_output}" | grep -oP 'Creation time:\s*\K.*' || echo "") + if [[ -n "${build_start_time}" ]]; then + local build_start_epoch + build_start_epoch="$(date -u -d "${build_start_time}" +%s)" + local timeout_epoch + timeout_epoch=$((build_start_epoch + 3600)) # 60 minutes + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" > /dev/null 2>&1 ; do + # Check if the main tarball has appeared in the meantime + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1; then + echo "INFO: Cache file ${gcs_tarball} appeared while waiting. Skipping build." + break # Exit while loop, will be caught by the next check + fi + local now_epoch + now_epoch="$(date -u +%s)" + if (( now_epoch > timeout_epoch )) ; then + echo "WARN: Timeout waiting for ${gcs_tarball}.building to be removed. Removing it myself." + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" + break + fi + echo "INFO: Waiting for existing build of ${gcs_tarball} to complete..." + sleep 1m # Shorter sleep for faster detection + done + fi + fi + + # Re-check if the tarball was created while we were waiting + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1 ; then + echo "Cache hit for ${env_name}. Unpacking from ${gcs_tarball}" + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory: ${envpath}" + rm -rf "${envpath}" + fi + mkdir -p "${envpath}" + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz + # Skip the rest of the build, go directly to jupyter kernel registration + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m pip install ipykernel + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + mark_complete "install_env_${env_name}" + return 0 + fi + + echo "INFO: Proceeding to build ${env_name}." + # Clean up any previous partial build attempt (if timeout occurred) + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || echo "WARN: No .building file to remove." + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory for rebuild: ${envpath}" + rm -rf "${envpath}" + fi + + touch "${local_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" + building_file="${gcs_tarball}.building" + + local conda_path="${conda_root_path}/bin/mamba" + if ! command -v "${conda_path}" > /dev/null 2>&1; then + echo "Mamba not found, installing..." + "${conda_root_path}/bin/conda" install -n base -c conda-forge mamba -y \ + || echo "WARN: Mamba installation failed." + if ! command -v "${conda_path}" > /dev/null 2>&1; then + echo "Mamba not found, falling back to conda." + conda_path="${conda_root_path}/bin/conda" + fi + fi + echo "Using installer: ${conda_path}" + + local conda_err_file="${tmpdir}/conda_create_${env_name}.err" + echo "DEBUG: About to run ${conda_path} create for ${env_name}" + set +e + "${conda_path}" create -y -n "${env_name}" "${packages[@]}" > "${conda_err_file}" 2>&1 + local conda_exit_code=$? + set -e + echo "DEBUG: ${conda_path} create finished with exit code ${conda_exit_code}" + + if [[ "${conda_exit_code}" -ne 0 ]]; then + cat "${conda_err_file}" >&2 + if [[ "${conda_path}" == *mamba ]] && grep -q "RuntimeError: Multi-download failed." "${conda_err_file}"; then + echo "ERROR: Mamba failed to create the environment, likely due to a proxy issue on this platform." >&2 + echo "ERROR: Please run this initialization action in a non-proxied environment at least once to build and populate the GCS cache for '${gcs_tarball}'." >&2 + echo "ERROR: Once the cache exists, subsequent runs in the proxied environment should succeed." >&2 + exit 1 + else + echo "ERROR: Conda/Mamba environment creation failed with exit code ${conda_exit_code}." >&2 + exit "${conda_exit_code}" + fi + fi + rm -f "${conda_err_file}" + + # Activate environment for any pip installs + echo "Activating ${env_name} environment..." + source "${conda_root_path}/etc/profile.d/conda.sh" + set +u # Temporarily disable unbound variable check + conda activate "${env_name}" + set -u # Re-enable unbound variable check + echo "Activated $(which python)" + + if [[ "${env_name}" == "tensorflow" ]]; then + echo "Installing TensorFlow with GPU support using pip in '${env_name}' env..." + python -m pip install --upgrade pip + python -m pip install --no-cache-dir 'tensorflow[and-cuda]>=2.16.0,<2.17.0' + + # Verify TensorFlow GPU + echo "DEBUG: Verifying TensorFlow GPU inside init action..." + python <<-'EOF' +import tensorflow as tf +print(f"TF Version: {tf.__version__}") +print(f"GPU Available: {tf.config.list_physical_devices('GPU')}") +print(f"Build Info: {tf.sysconfig.get_build_info()}") +gpus = tf.config.list_physical_devices('GPU') +if not gpus: + print("ERROR: TensorFlow cannot detect GPU!") + exit(1) +print(f"TensorFlow GPU check passed: {gpus}") +EOF + if [[ $? -ne 0 ]]; then + echo "ERROR: TensorFlow GPU verification failed in ${env_name} environment." + exit 1 + fi + echo "DEBUG: TensorFlow verification done." + fi + + conda deactivate + + echo "Packaging environment '${env_name}'" + pushd "${envpath}" + tar czf "${local_tarball}" . + popd + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if [[ -n "${building_file:-}" ]]; then + "${gsutil_cmd[@]}" rm "${building_file}" || true + building_file="" + fi + rm -f "${local_tarball}" + echo "Environment '${env_name}' built and cached." + fi + + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m pip install ipykernel + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + mark_complete "install_env_${env_name}" +} function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" mark_incomplete install-local-cudnn-repo @@ -700,7 +897,60 @@ function install_local_cudnn8_repo() { cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings mark_complete install-local-cudnn8-repo } +function install_tensorflow() { + include_tensorflow="$(get_metadata_attribute 'include-tensorflow' 'false')" + echo "DEBUG: include-tensorflow metadata value: [${include_tensorflow}]" + if [[ "${include_tensorflow^^}" != "TRUE" && "${include_tensorflow^^}" != "YES" && "${include_tensorflow}" != "1" ]]; then + echo "Skipping TensorFlow installation." + return 0 + fi + is_complete install_env_tensorflow && return + + local channels=('-c' 'conda-forge') + local packages=( + "python=3.11" "pyspark" "pandas" "numba" "pyarrow" + ) + create_conda_env "tensorflow" "${channels[@]}" "${packages[@]}" +} +function install_pytorch() { + include_pytorch="$(get_metadata_attribute 'include-pytorch' 'false')" + echo "DEBUG: 062: include-pytorch metadata value: [${include_pytorch}]" + if [[ "${include_pytorch^^}" != "TRUE" && "${include_pytorch^^}" != "YES" && "${include_pytorch}" != "1" ]]; then + echo "DEBUG: 062: Skipping PyTorch/Rapids installation." + return 0 + fi + echo "DEBUG: 062: Passed include-pytorch check" + + # Create isolated PyTorch environment + if ! is_complete install_env_pytorch; then + echo "DEBUG: 062: About to create pytorch env" + local channels=('-c' 'pytorch' '-c' 'nvidia' '-c' 'conda-forge') + local pt_packages=( + "python=3.11" "pytorch" "torchvision" "torchaudio" "pyspark" "numba" + ) + create_conda_env "pytorch" "${channels[@]}" "${pt_packages[@]}" + echo "DEBUG: 062: create_conda_env pytorch finished with exit code $?" + else + echo "DEBUG: 062: pytorch sentinel found, skipping creation" + fi + + echo "DEBUG: 062: After pytorch env block" + + # Create isolated Rapids environment + if ! is_complete install_env_rapids; then + echo "DEBUG: 062: About to create rapids env" + local channels=('-c' 'rapidsai' '-c' 'nvidia' '-c' 'conda-forge') + local rapids_packages=( + "python=3.11" "rapids" "pyspark" "numba" + ) + create_conda_env "rapids" "${channels[@]}" "${rapids_packages[@]}" + echo "DEBUG: 062: create_conda_env rapids finished with exit code $?" + else + echo "DEBUG: 062: rapids sentinel found, skipping creation" + fi + echo "DEBUG: 062: End of install_pytorch function" +} function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" mark_incomplete install-local-cudnn8-repo @@ -724,7 +974,7 @@ function install_nvidia_nccl() { test -d "${workdir}/nccl" || { local tarball_fn="v${NCCL_VERSION}-1.tar.gz" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ | tar xz mv "nccl-${NCCL_VERSION}-1" nccl @@ -905,87 +1155,6 @@ function install_nvidia_cudnn() { mark_complete cudnn } -function install_pytorch() { - is_complete pytorch && return - - local env - env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') - - local conda_root_path - if version_lt "${DATAPROC_IMAGE_VERSION}" "2.3" ; then - conda_root_path="/opt/conda/miniconda3" - else - conda_root_path="/opt/conda" - fi - [[ -d ${conda_root_path} ]] || return - local envpath="${conda_root_path}/envs/${env}" - if [[ "${env}" == "base" ]]; then - echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${conda_root_path}" ; fi - # Set numa node to 0 for all GPUs - for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done - - local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" - - if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then - # when running with fewer than 32 cores, yield to in-progress build - sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" - if [[ "$?" == "0" ]] ; then - local build_start_time build_start_epoch timeout_epoch - build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" - build_start_epoch="$(date -u -d "${build_start_time}" +%s)" - timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do - local now_epoch="$(date -u +%s)" - if (( now_epoch > timeout_epoch )) ; then - # detect unexpected build failure after 45m - "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" - break - fi - sleep 5m - done - fi - fi - - if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - mkdir -p "${envpath}" - "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz - else - touch "${local_tarball}.building" - "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" - building_file="${gcs_tarball}.building" - local verb=create - if test -d "${envpath}" ; then verb=install ; fi - cudart_spec="cuda-cudart" - if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi - - # Install pytorch and company to this environment - "${conda_root_path}/bin/mamba" "${verb}" -n "${env}" \ - -c conda-forge -c nvidia -c rapidsai \ - numba pytorch tensorflow[and-cuda] rapids pyspark \ - "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" - - # Install jupyter kernel in this environment - "${envpath}/bin/python3" -m pip install ipykernel - - # package environment and cache in GCS - pushd "${envpath}" - tar czf "${local_tarball}" . - popd - "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" - if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi - building_file="" - fi - - # register the environment as a selectable kernel - "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})" - - mark_complete pytorch -} function configure_dkms_certs() { if test -v PSN && [[ -z "${PSN}" ]]; then @@ -2106,6 +2275,7 @@ readonly SPARK_CONF_DIR='/etc/spark/conf' readonly bdcfg="/usr/local/bin/bdconfig" readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package readonly tmpdir="${tmpdir}" +readonly install_log="${tmpdir}/install.log" # --- Define Necessary Global Arrays --- # These need to be explicitly defined here as they are not functions. @@ -2310,13 +2480,15 @@ function main() { install_nvidia_nccl install_nvidia_cudnn fi - case "${INCLUDE_PYTORCH^^}" in - "1" | "YES" | "TRUE" ) install_pytorch ;; - esac + + install_tensorflow + install_pytorch #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + echo "DEBUG: About to call install_gpu_agent" #install_ops_agent install_gpu_agent + echo "DEBUG: Finished install_gpu_agent call. Exit code: $?" echo 'GPU metrics agent successfully deployed.' else echo 'GPU metrics agent will not be installed.' diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index db64083da..f1fda23ef 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -71,7 +71,7 @@ def verify_pytorch(self, name): # executed improves readability of the diagnostic information. verify_cmd = \ - "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ + "env={} ; envpath=/opt/conda/default/envs/${env} ; ".format(conda_env) + \ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ "${envpath}/bin/python {}".format( self.TORCH_TEST_SCRIPT_FILE_NAME) @@ -85,7 +85,7 @@ def verify_tensorflow(self, name): # all on a single numa node conda_env="dpgce" verify_cmd = \ - "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ + "env={} ; envpath=/opt/conda/default/envs/${env} ; ".format(conda_env) + \ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ "${envpath}/bin/python {}".format( self.TF_TEST_SCRIPT_FILE_NAME) @@ -397,7 +397,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') self.skipTest("known to fail") - metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) + metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={},include-tensorflow=true,include-pytorch=yes".format(cuda_version) self.createCluster( configuration, self.INIT_ACTIONS, From 72e596566ba996c979742f8e8476ecea86f61948 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 15 May 2026 20:37:45 +0000 Subject: [PATCH 07/12] Fix: Correct variable quoting and array expansions This commit addresses several quoting and array expansion issues throughout the GPU initialization scripts to improve robustness and prevent word splitting problems. **Key Changes:** * **Array Expansions:** Correctly expanded arrays using `"${array[@]}"` for `curl_retry_args` and `GPG_PROXY_ARGS` in functions like `add_repo_nvidia_container_toolkit`, `add_repo_cuda`, `build_driver_from_github`, and `dnf_add_repo`. * **Variable Quoting:** Ensured various variables are properly double-quoted or single-quoted within strings, particularly in `echo` commands and command arguments. This includes variables like `tmpfile`, `DIST_KEYRING_DIR`, `module`, `kr_path`, `repo_data`, and proxy variables. * **Conda/Mamba Fallback:** Added logic to `create_conda_env` to fallback to using `conda` instead of `mamba` on older distributions (Debian 10, Ubuntu 18) where mamba might have download issues. --- gpu/install_gpu_driver.sh | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index e311042a5..9f9b94976 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -78,7 +78,7 @@ function repair_old_backports { function print_metadata_value() { local readonly tmpfile=$(mktemp) http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ - -s -o ${tmpfile} 2>/dev/null) + -s -o "${tmpfile}" 2>/dev/null) local readonly return_code=$? # If the command completed successfully, print the metadata value to stdout. if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then @@ -623,7 +623,7 @@ function install_local_cuda_repo() { dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" - cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + cp "${DIST_KEYRING_DIR}"/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then curl "${curl_retry_args[@]}" \ @@ -778,6 +778,12 @@ function create_conda_env() { conda_path="${conda_root_path}/bin/conda" fi fi + + # Fallback to conda for older OSes due to download issues with mamba + if is_debian10 || is_ubuntu18; then + echo "INFO: Older OS detected, using conda instead of mamba for environment ${env_name}" + conda_path="${conda_root_path}/bin/conda" + fi echo "Using installer: ${conda_path}" local conda_err_file="${tmpdir}/conda_create_${env_name}.err" @@ -1281,13 +1287,13 @@ function add_repo_nvidia_container_toolkit() { GPG_PROXY="--keyserver-options http-proxy=${http_proxy}" fi execute_with_retries gpg --keyserver keyserver.ubuntu.com \ - ${GPG_PROXY_ARGS} \ + "${GPG_PROXY_ARGS[@]}" \ --no-default-keyring --keyring "${kr_path}" \ --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" "0xc95b321b61e88c1809c4f759ddcae044f796ecb0" local -r repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" local -r repo_path="/etc/apt/sources.list.d/${repo_name}.list" echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + echo "deb-src [signed-by=\"${kr_path}\"] \"${repo_data}\"" >> "${repo_path}" execute_with_retries apt-get update else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" @@ -1310,9 +1316,9 @@ function add_repo_cuda() { if [[ -n "${HTTP_PROXY}" ]] ; then GPG_PROXY="--keyserver-options http-proxy=${HTTP_PROXY}" elif [[ -n "${http_proxy}" ]] ; then - GPG_PROXY="--keyserver-options http-proxy=${http_proxy}" + GPG_PROXY="--keyserver-options http-proxy=\"${http_proxy}\"" fi - execute_with_retries gpg --keyserver keyserver.ubuntu.com ${GPG_PROXY_ARGS} \ + execute_with_retries gpg --keyserver keyserver.ubuntu.com "${GPG_PROXY_ARGS[@]}" \ --no-default-keyring --keyring "${kr_path}" \ --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" else @@ -1333,7 +1339,7 @@ function build_driver_from_github() { pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { tarball_fn="${DRIVER_VERSION}.tar.gz" - execute_with_retries curl ${curl_retry_args} \ + execute_with_retries curl "${curl_retry_args[@]}" \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ \| tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules @@ -2496,7 +2502,7 @@ function main() { # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + rmmod "${module}" > /dev/null 2>&1 || echo "unable to rmmod \"${module}\"" done if test -n "$(nvsmi -L)" ; then @@ -3162,7 +3168,7 @@ function apt_add_repo() { echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + echo "deb-src [signed-by='${kr_path}'] ${repo_data}" >> "${repo_path}" fi apt-get update -qq @@ -3177,7 +3183,7 @@ function dnf_add_repo() { local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - curl ${curl_retry_args} "${repo_url}" \ + curl "${curl_retry_args[@]}" "${repo_url}" \ | dd of="${repo_path}" status=progress } From b1774c5b6ff3f36326edd1cbd032ef7596b61775 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 15 May 2026 21:31:13 +0000 Subject: [PATCH 08/12] Refactor: Introduce import_gpg_keys function for robust key management This commit introduces a new helper function, `import_gpg_keys`, to standardize the process of fetching and importing GPG keys for repository setup. This function handles keys provided as URLs or keyserver IDs, ensures keys are dearmored, and includes proxy support and retries. **Changes:** * **New Function:** Added `import_gpg_keys` to `080_import_gpg_keys.sh`. * **Refactored `add_repo_nvidia_container_toolkit`:** Now uses `import_gpg_keys` to fetch necessary keys. * **Refactored `add_repo_cuda`:** Now uses `import_gpg_keys` for older CUDA version key fetching. * **Refactored `clean_up_sources_lists`:** The MySQL key import is updated to use `import_gpg_keys` with the key ID, fixing a bug where it previously attempted to dearmor HTML output. This change improves the reliability and consistency of GPG key handling within the GPU initialization script. --- gpu/install_gpu_driver.sh | 62 +++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 9f9b94976..ab47f95e6 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1266,6 +1266,56 @@ function add_nonfree_components() { sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list fi } +function import_gpg_keys() { + local keyring_path="$1" + shift + local keys=("$@") + + mkdir -p "$(dirname "${keyring_path}")" + + local GPG_PROXY_ARGS=() + if [[ -n "${HTTP_PROXY:-}" ]]; then + GPG_PROXY_ARGS=(--keyserver-options "http-proxy=${HTTP_PROXY}") + elif [[ -n "${http_proxy:-}" ]]; then + GPG_PROXY_ARGS=(--keyserver-options "http-proxy=${http_proxy}") + fi + + local tmp_keyring + tmp_keyring=$(mktemp) + local keyserver_keys_found=0 + + for key in "${keys[@]}"; do + echo "DEBUG: Importing GPG key: ${key} into ${keyring_path}" + if [[ "${key}" =~ ^https?:// ]]; then + # Import dearmored key from URL, overwrites keyring_path + if ! execute_with_retries curl "${curl_retry_args[@]}" "${key}" | gpg --dearmor --yes -o "${keyring_path}"; then + echo "ERROR: Failed to import GPG key from URL: ${key}" + rm -f "${tmp_keyring}" + exit 1 + fi + elif [[ "${key}" =~ ^0x ]]; then + # Fetch key from keyserver into tmp_keyring + keyserver_keys_found=1 + if ! execute_with_retries gpg --keyserver keyserver.ubuntu.com "${GPG_PROXY_ARGS[@]}" --no-default-keyring --keyring "${tmp_keyring}" --recv-keys "${key}"; then + echo "ERROR: Failed to receive GPG key from keyserver: ${key}" + rm -f "${tmp_keyring}" + exit 1 + fi + else + echo "WARN: Unrecognized key format, skipping: ${key}" + fi + done + + # If any keys were fetched from keyserver, export and dearmor them all into the final keyring + if [[ "${keyserver_keys_found}" -eq 1 ]]; then + if ! gpg --no-default-keyring --keyring "${tmp_keyring}" --export | gpg --dearmor --yes -o "${keyring_path}"; then + echo "ERROR: Failed to export/dearmor GPG keys from temporary keyring" + rm -f "${tmp_keyring}" + exit 1 + fi + fi + rm -f "${tmp_keyring}" +} # # Install package signing key and add corresponding repository @@ -1286,10 +1336,7 @@ function add_repo_nvidia_container_toolkit() { elif [[ -v http_proxy ]] ; then GPG_PROXY="--keyserver-options http-proxy=${http_proxy}" fi - execute_with_retries gpg --keyserver keyserver.ubuntu.com \ - "${GPG_PROXY_ARGS[@]}" \ - --no-default-keyring --keyring "${kr_path}" \ - --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" "0xc95b321b61e88c1809c4f759ddcae044f796ecb0" + import_gpg_keys "${kr_path}" "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" "0xc95b321b61e88c1809c4f759ddcae044f796ecb0" local -r repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" local -r repo_path="/etc/apt/sources.list.d/${repo_name}.list" echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" @@ -1318,9 +1365,7 @@ function add_repo_cuda() { elif [[ -n "${http_proxy}" ]] ; then GPG_PROXY="--keyserver-options http-proxy=\"${http_proxy}\"" fi - execute_with_retries gpg --keyserver keyserver.ubuntu.com "${GPG_PROXY_ARGS[@]}" \ - --no-default-keyring --keyring "${kr_path}" \ - --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" + import_gpg_keys "${kr_path}" "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" else install_cuda_keyring_pkg # 11.7+, 12.0+ fi @@ -2676,8 +2721,7 @@ function clean_up_sources_lists() { # if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then rm -f /usr/share/keyrings/mysql.gpg - curl ${curl_retry_args} 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ - gpg --dearmor -o /usr/share/keyrings/mysql.gpg + import_gpg_keys "/usr/share/keyrings/mysql.gpg" "0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C" sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list fi From f6a96f42af76fe62a0e585dff2a413dda6708698 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 15 May 2026 22:38:27 +0000 Subject: [PATCH 09/12] Feat: Centralized network evaluation and caching This commit introduces a network evaluation function to centralize connectivity checks and cache the results, making subsequent network-dependent operations more efficient and robust. **Key Changes:** * **New `evaluate_network` Function:** * Created `126_evaluate_network.sh` with the `evaluate_network` function. * This function probes network configuration and connectivity: * Fetches metadata for proxy and egress settings. * Checks for external IP, IPv4/IPv6 default routes. * Tests DNS (A/AAAA), ping, and HTTP connectivity to `www.gstatic.com`. * Tests HTTP connectivity to `us.download.nvidia.com`. * Saves results to `${tmpdir}/network_state.json`. * **Early Execution:** * The `evaluate_network` function is now called at the beginning of `prepare_to_install` to ensure network state is known early. * `tmpdir` initialization and `mount_ramdisk` are also moved to the start of `prepare_to_install`, and `tmpdir` and `install_log` are exported to be available in other functions. * **Dependency Addition:** Added `jq` and `dnsutils` to `install_dependencies` to support the new function. * **Refactored Network Check:** * Modified `151_set_proxy.sh` to use the cached network state from `${tmpdir}/network_state.json` via `jq` to check for `https://google.com` reachability, instead of performing a direct `curl` call. * **Bug Fix:** Removed an extraneous closing brace in `151_set_proxy.sh` that was causing a syntax error. --- gpu/install_gpu_driver.sh | 119 ++++++++++++++++++++++++++++++++++---- 1 file changed, 109 insertions(+), 10 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index ab47f95e6..d9da985c0 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -2114,6 +2114,98 @@ function is_complete() { phase="$1" test -f "${workdir}/complete/${phase}" } +function evaluate_network() { + local state_file="${tmpdir}/network_state.json" + echo "INFO: Evaluating network and writing state to ${state_file}" + + # Metadata checks + local http_proxy=$(get_metadata_attribute 'http-proxy' 'null') + if [[ "${http_proxy}" != "null" ]]; then http_proxy=""${http_proxy}""; fi + local swp_egress=$(get_metadata_attribute 'swp-egress' 'false') + + local instance_ips=$(hostname -I || echo "") + local has_external_ip="false" + # Crude check for non-internal IP + if [[ "${instance_ips}" =~ [^10\.|^172\.(1[6-9]|2[0-9]|3[0-1])\.|^192\.168] ]]; then + has_external_ip="true" + fi + + # Kernel Route Table + local default_route_v4="null" + local default_route_v6="null" + if ip -4 route show default | grep -q default; then + default_route_v4=""$(ip -4 route show default)"" + fi + if ip -6 route show default | grep -q default; then + default_route_v6=""$(ip -6 route show default)"" + fi + + # DNS & Connectivity Tests + local target_host="www.gstatic.com" + local dns_v4_ips=($(dig +short A "${target_host}" || true)) + local dns_v6_ips=($(dig +short AAAA "${target_host}" || true)) + + local dns_v4_ok="false"; [[ ${#dns_v4_ips[@]} -gt 0 ]] && dns_v4_ok="true" + local dns_v6_ok="false"; [[ ${#dns_v6_ips[@]} -gt 0 ]] && dns_v6_ok="true" + + local ping_v4_ok="false" + if [[ "${dns_v4_ok}" == "true" ]]; then + if ping -c 1 "${dns_v4_ips[0]}" >/dev/null 2>&1; then ping_v4_ok="true"; fi + fi + + local ping_v6_ok="false" + if [[ "${dns_v6_ok}" == "true" ]]; then + if ping -6 -c 1 "${dns_v6_ips[0]}" >/dev/null 2>&1; then ping_v6_ok="true"; fi + fi + + local curl_target="http://${target_host}/generate_204" + local curl_v4_ok="false" + if curl -4 -s -m 10 --head "${curl_target}" >/dev/null 2>&1; then + curl_v4_ok="true" + fi + + local curl_v6_ok="false" + if curl -6 -s -m 10 --head "${curl_target}" >/dev/null 2>&1; then + curl_v6_ok="true" + fi + + # More general checks + local nvidia_http_ok="false" + if curl -s -m 10 --head "https://us.download.nvidia.com" >/dev/null 2>&1; then + nvidia_http_ok="true" + fi + + # Assemble JSON + cat << EOF > "${state_file}" +{ + "config": { + "has_external_ip": ${has_external_ip}, + "http_proxy": ${http_proxy}, + "swp_egress": ${swp_egress} + }, + "routing": { + "default_route_v4": ${default_route_v4}, + "default_route_v6": ${default_route_v6} + }, + "gstatic": { + "dns_v4_ok": ${dns_v4_ok}, + "dns_v4_ips": [$(printf '"%s",' "${dns_v4_ips[@]}" | sed 's/,$//')], + "ping_v4_ok": ${ping_v4_ok}, + "curl_v4_ok": ${curl_v4_ok}, + "dns_v6_ok": ${dns_v6_ok}, + "dns_v6_ips": [$(printf '"%s",' "${dns_v6_ips[@]}" | sed 's/,$//')], + "ping_v6_ok": ${ping_v6_ok}, + "curl_v6_ok": ${curl_v6_ok} + }, + "http_checks": { + "https://us.download.nvidia.com": ${nvidia_http_ok} + } +} +EOF + + echo "INFO: Network state evaluation complete." + cat "${state_file}" # For debugging +} function mark_complete() { phase="$1" @@ -2128,7 +2220,7 @@ function mark_incomplete() { function install_dependencies() { is_complete install-dependencies && return 0 - pkg_list="screen" + pkg_list="screen jq dnsutils" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi mark_complete install-dependencies @@ -2965,11 +3057,13 @@ EOF echo "${output}" exit 1 } - output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://google.com" 2>&1)|| { - echo "curl rejects proxy configuration" - echo "${curl_output}" + local state_file="${tmpdir}/network_state.json" + if [[ $(jq -r '.http["https://google.com"]' "${state_file}") == "true" ]]; then + echo "DEBUG: Confirmed google.com is reachable from network state cache." + else + echo "ERROR: google.com is not reachable according to network state cache." exit 1 - } + fi output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run" 2>&1)|| { echo "curl rejects proxy configuration" echo "${output}" @@ -3054,6 +3148,16 @@ function harden_sshd_config() { } function prepare_to_install(){ + # Setup temporary directories (potentially on RAM disk) + tmpdir=/tmp/ # Default + mount_ramdisk # Updates tmpdir if successful + export tmpdir + install_log="${tmpdir}/install.log" # Set install log path based on final tmpdir + export install_log + + # Evaluate network and cache results *before* any network operations + evaluate_network + readonly uname_r=$(uname -r) # Verify OS compatability and Secure boot state check_os @@ -3104,11 +3208,6 @@ function prepare_to_install(){ # ["NVIDIA-Linux-x86_64-550.135.run"]="a8c3ae0076f11e864745fac74bfdb01f" # ["NVIDIA-Linux-x86_64-550.142.run"]="e507e578ecf10b01a08e5424dddb25b8" - # Setup temporary directories (potentially on RAM disk) - tmpdir=/tmp/ # Default - mount_ramdisk # Updates tmpdir if successful - install_log="${tmpdir}/install.log" # Set install log path based on final tmpdir - workdir=/opt/install-dpgce # Set GCS bucket for caching temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" From 850d845b0041fbbe7faf95df756f617e0df80a37 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 15 May 2026 23:33:54 +0000 Subject: [PATCH 10/12] Update GPU integration tests to verify isolated Conda environments This commit updates `test_gpu.py` to correctly test the isolated Conda environments (`pytorch`, `tensorflow`, and `rapids`) created when the corresponding metadata flags (`include-pytorch=yes`, `include-tensorflow=true`) are provided. Specific changes: * Modified `verify_pytorch` and `verify_tensorflow` to source `conda.sh` and activate their respective environments before running the test scripts, rather than defaulting to the base `dpgce` environment. * Added a `verify_rapids` method that activates the `rapids` environment and successfully imports `cuml`. * Integrated calls to `verify_pytorch`, `verify_tensorflow`, and `verify_rapids` into the `test_install_gpu_cuda_nvidia_with_spark_job` test case. * Suppressed stderr on NUMA node listing to avoid log noise on VMs without NUMA nodes. * Fixed an invalid escape sequence `\s` warning by using a raw string for the Perl certificate verification command. * Added a workaround in `setUpClass` to explicitly map `PROJECT_ID` and `REGION` environment variables to `DataprocTestCase` to fix an issue where these variables are lost during `stage_init_actions`. --- gpu/test_gpu.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f1fda23ef..18f870d06 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -1,5 +1,6 @@ import pkg_resources import time +import os from absl.testing import absltest from absl.testing import parameterized @@ -18,6 +19,15 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): GPU_A100 = "type=nvidia-tesla-a100,count=2" GPU_H100 = "type=nvidia-h100-80gb,count=2" + @classmethod + def setUpClass(cls): + # Hack to workaround a bug in DataprocTestCase.setUpClass where it hardcodes + # DataprocTestCase().stage_init_actions() and loses the cls variables + import os + DataprocTestCase.PROJECT = os.getenv("PROJECT_ID") + DataprocTestCase.REGION = os.getenv("REGION") + super().setUpClass() + # Tests for PyTorch TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py" @@ -63,18 +73,16 @@ def verify_pytorch(self, name): self.TORCH_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) - conda_env="dpgce" - # until the numa node is selected, every time the GPU is accessed # from pytorch, log noise about numa node not being selected is # printed to the console. Selecting numa node before the python is # executed improves readability of the diagnostic information. - + env_name = "pytorch" verify_cmd = \ - "env={} ; envpath=/opt/conda/default/envs/${env} ; ".format(conda_env) + \ - "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ - "${envpath}/bin/python {}".format( - self.TORCH_TEST_SCRIPT_FILE_NAME) + "source /opt/conda/default/etc/profile.d/conda.sh && conda activate {} && ".format(env_name) + \ + "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > ${f} ; done ;" + \ + "/opt/conda/default/envs/{}/bin/python {}".format( + env_name, self.TORCH_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) @@ -83,15 +91,23 @@ def verify_tensorflow(self, name): self.TF_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) # all on a single numa node - conda_env="dpgce" + env_name = "tensorflow" verify_cmd = \ - "env={} ; envpath=/opt/conda/default/envs/${env} ; ".format(conda_env) + \ - "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ - "${envpath}/bin/python {}".format( - self.TF_TEST_SCRIPT_FILE_NAME) + "source /opt/conda/default/etc/profile.d/conda.sh && conda activate {} && ".format(env_name) + \ + "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > ${f} ; done ;" + \ + "/opt/conda/default/envs/{}/bin/python {}".format( + env_name, self.TF_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) + def verify_rapids(self, name): + # Verify that rapids works + env_name = "rapids" + verify_cmd = \ + "source /opt/conda/default/etc/profile.d/conda.sh && conda activate {} && ".format(env_name) + \ + "python -c 'import cuml; print(\"CUML Imported Successfully\")'" + self.assert_instance_command(name, verify_cmd) + def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -163,7 +179,7 @@ def verify_driver_signature(self, name): if self.getImageOs() == 'ubuntu': cert_path='/var/lib/shim-signed/mok/MOK.der' - cert_verification_cmd = """ + cert_verification_cmd = r""" perl -Mv5.10 -e ' my $cert = ( qx{openssl x509 -inform DER -in {} -text} =~ /Serial Number:.*? +(.+?)\s*$/ms ); @@ -413,6 +429,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_gpu_agent(machine_name) + self.verify_pytorch(machine_name) + self.verify_tensorflow(machine_name) + self.verify_rapids(machine_name) self.verify_instance_spark() @parameterized.parameters( From 195335200202266b6afc7bf919d95eb2af8c6c03 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 16 May 2026 00:45:03 +0000 Subject: [PATCH 11/12] Fix minor bugs and increase integration test robustness This commit addresses several minor issues and improves the reliability of the initialization action and its integration tests: * **Prevent out-of-disk errors during Conda builds:** Changed `create_conda_env` to build tarballs in the `tmpdir` (tmpfs) rather than the `workdir` (/opt). This significantly reduces root disk usage during the creation of massive environments like PyTorch and Rapids. * **Increase test boot disk size:** Increased the default `boot_disk_size` in `test_gpu.py` from 50GB to 60GB to provide additional headroom and prevent the disk from filling up during heavy test combinations. * **Improve error logging:** Added a descriptive error message to `execute_with_retries` so the failing command is printed before exiting. * **Patch GPU agent metadata URL:** Added a `sed` command to dynamically patch `report_gpu_metrics.py` to use the fully qualified `http://metadata.google.internal/` instead of `http://metadata/`. * **Update GPU agent systemd unit:** Added `EnvironmentFile=-/etc/environment` to the `gpu-utilization-agent.service` to ensure it picks up proxy configurations and other environment variables. * **Suppress noise in backports repair:** Added `2>/dev/null` to the `curl` command fetching the `stable` Release file in `repair_old_backports` to suppress expected 404 errors on newer distributions. --- gpu/install_gpu_driver.sh | 7 +++++-- gpu/test_gpu.py | 14 +++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index d9da985c0..a90c1c203 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -64,7 +64,7 @@ function repair_old_backports { debdists="https://deb.debian.org/debian/dists" oldoldstable=$(curl "${curl_retry_args[@]}" "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); oldstable=$( curl "${curl_retry_args[@]}" "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); - stable=$( curl "${curl_retry_args[@]}" "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl "${curl_retry_args[@]}" "${debdists}/stable/Release" 2>/dev/null | awk '/^Codename/ {print $2}'); matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) @@ -590,6 +590,7 @@ function execute_with_retries() ( if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done + echo "ERROR: Command failed after 3 retries: ${cmd}" >&2 return 1 ) @@ -671,7 +672,7 @@ function create_conda_env() { for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > "${f}" || true ; done local build_tarball="${env_name}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" + local local_tarball="${tmpdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" if is_complete "install_env_${env_name}"; then @@ -1790,6 +1791,7 @@ function install_gpu_agent() { curl "${curl_retry_args[@]}" \ "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ + | sed -e 's|http://metadata/|http://metadata.google.internal/|g' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" python_interpreter="/opt/conda/miniconda3/bin/python3" @@ -1818,6 +1820,7 @@ Description=GPU Utilization Metric Agent [Service] Type=simple PIDFile=/run/gpu_agent.pid +EnvironmentFile=-/etc/environment ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' User=root Group=root diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 18f870d06..0153cc139 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -216,7 +216,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB") + boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) @@ -250,7 +250,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) @@ -299,7 +299,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB") + boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) @@ -347,7 +347,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", startup_script="gpu/mig.sh") for machine_suffix in ["w-0", "w-1"]: @@ -380,7 +380,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, - boot_disk_size="50GB", + boot_disk_size="60GB", timeout_in_minutes=90) self.verify_instance_spark() @@ -422,7 +422,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: @@ -480,7 +480,7 @@ def untested_driver_signing(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: hostname="{}-{}".format(self.getClusterName(),machine_suffix) From 1ae5a262be0c1642d058f0a6c37d32e47d13f496 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 20 May 2026 23:02:55 +0000 Subject: [PATCH 12/12] Fix Conda environment build failures and solver deadlocks on legacy OSes This commit addresses several critical issues with the creation of Conda environments (PyTorch, TensorFlow, RAPIDS) during GPU cluster initialization, specifically mitigating solver deadlocks on legacy operating systems (Dataproc 2.0 / Debian 10 / Ubuntu 18). * **Graceful Degradation for ML Environments on Legacy OSes:** * Dataproc 2.0 images rely on the classic Conda dependency solver, which frequently deadlocks when attempting to resolve massive modern ML graphs (like PyTorch and RAPIDS). * Wrapped the `conda create` execution in a strict 3-minute timeout (`timeout 3m`) for legacy OSes. If the timeout triggers, the script now cleanly skips the environment creation and outputs an advisory warning recommending Dataproc 2.1+ instead of failing the cluster creation. * Exempted `tensorflow` from the 3-minute timeout on legacy OSes, as its base environment resolution does not rely on massive, conflicting CUDA toolkits and can complete successfully given enough time. * **Improved Conda Environment Build Visibility:** * Replaced silent file redirection (`>`) with `tee` for `conda create` output, streaming progress to the main install log while still capturing errors. * Prepended `time` to the build commands on modern OSes to track duration in Cloud Logging. * **Resolved TensorFlow Segfaults and Bash Errors:** * Removed inline TensorFlow Python verification (`import tensorflow as tf`) from the root-level bash script. This prevents a `Segmentation fault` that was aborting the initialization script due to environment/library loading collisions. Validation is fully deferred to the Python integration tests. * Wrapped `conda deactivate` in `set +u` to prevent fatal unbound variable errors (e.g., `$PS1`) in strict bash environments. * **Optimized PyTorch Package Resolution:** * Explicitly added `pytorch-cuda=${CUDA_VERSION}` to the PyTorch package installation array. * Removed the `conda-forge` channel from the PyTorch installation command. * Together, these significantly shrink the dependency permutation space, helping the Conda solver correctly resolve the GPU variant without deadlocking. * **Integration Test Updates:** * Updated `gpu/test_gpu.py` to bypass `verify_pytorch` and `verify_rapids` on Dataproc `< 2.1` due to the expected 3-minute Conda solver timeout. * Ensured `verify_tensorflow` continues to run and validate on all OS versions. --- gpu/install_gpu_driver.sh | 49 +++++++++++++++++++++------------------ gpu/test_gpu.py | 9 +++++-- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index a90c1c203..4b591764a 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -790,8 +790,28 @@ function create_conda_env() { local conda_err_file="${tmpdir}/conda_create_${env_name}.err" echo "DEBUG: About to run ${conda_path} create for ${env_name}" set +e - "${conda_path}" create -y -n "${env_name}" "${packages[@]}" > "${conda_err_file}" 2>&1 - local conda_exit_code=$? + + if is_debian10 || is_ubuntu18; then + if [[ "${env_name}" == "tensorflow" ]]; then + "${conda_path}" create -y -n "${env_name}" "${packages[@]}" 2>&1 | tee "${conda_err_file}" + local conda_exit_code=${PIPESTATUS[0]} + else + timeout 3m "${conda_path}" create -y -n "${env_name}" "${packages[@]}" 2>&1 | tee "${conda_err_file}" + local conda_exit_code=${PIPESTATUS[0]} + + if [[ "${conda_exit_code}" == 124 ]]; then + echo "WARN: Timed out (3m) attempting to resolve ${env_name} dependencies." >&2 + echo "WARN: The classic Conda dependency solver frequently deadlocks when installing massive packages like PyTorch or RAPIDS." >&2 + echo "WARN: GPU-accelerated Machine Learning environments are not supported on Dataproc 2.0 (Debian 10/Ubuntu 18.04)." >&2 + echo "WARN: Please upgrade to Dataproc 2.1 or newer (Debian 11+/Ubuntu 20.04+) to utilize these features." >&2 + set -e + return 0 + fi + fi + else + time "${conda_path}" create -y -n "${env_name}" "${packages[@]}" 2>&1 | tee "${conda_err_file}" + local conda_exit_code=${PIPESTATUS[0]} + fi set -e echo "DEBUG: ${conda_path} create finished with exit code ${conda_exit_code}" @@ -821,28 +841,11 @@ function create_conda_env() { echo "Installing TensorFlow with GPU support using pip in '${env_name}' env..." python -m pip install --upgrade pip python -m pip install --no-cache-dir 'tensorflow[and-cuda]>=2.16.0,<2.17.0' - - # Verify TensorFlow GPU - echo "DEBUG: Verifying TensorFlow GPU inside init action..." - python <<-'EOF' -import tensorflow as tf -print(f"TF Version: {tf.__version__}") -print(f"GPU Available: {tf.config.list_physical_devices('GPU')}") -print(f"Build Info: {tf.sysconfig.get_build_info()}") -gpus = tf.config.list_physical_devices('GPU') -if not gpus: - print("ERROR: TensorFlow cannot detect GPU!") - exit(1) -print(f"TensorFlow GPU check passed: {gpus}") -EOF - if [[ $? -ne 0 ]]; then - echo "ERROR: TensorFlow GPU verification failed in ${env_name} environment." - exit 1 - fi - echo "DEBUG: TensorFlow verification done." fi + set +u # Temporarily disable unbound variable check conda deactivate + set -u # Re-enable unbound variable check echo "Packaging environment '${env_name}'" pushd "${envpath}" @@ -932,9 +935,9 @@ function install_pytorch() { # Create isolated PyTorch environment if ! is_complete install_env_pytorch; then echo "DEBUG: 062: About to create pytorch env" - local channels=('-c' 'pytorch' '-c' 'nvidia' '-c' 'conda-forge') + local channels=('-c' 'pytorch' '-c' 'nvidia') local pt_packages=( - "python=3.11" "pytorch" "torchvision" "torchaudio" "pyspark" "numba" + "python=3.11" "pytorch" "torchvision" "torchaudio" "pytorch-cuda=${CUDA_VERSION}" "pyspark" "numba" ) create_conda_env "pytorch" "${channels[@]}" "${pt_packages[@]}" echo "DEBUG: 062: create_conda_env pytorch finished with exit code $?" diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 0153cc139..1434a53ab 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -429,9 +429,14 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_gpu_agent(machine_name) - self.verify_pytorch(machine_name) + self.verify_tensorflow(machine_name) - self.verify_rapids(machine_name) + if self.getImageVersion() >= pkg_resources.parse_version("2.1"): + self.verify_pytorch(machine_name) + self.verify_rapids(machine_name) + else: + print("Skipping PyTorch and RAPIDS verification on Dataproc < 2.1 due to expected Conda solver timeout.") + self.verify_instance_spark() @parameterized.parameters(