diff --git a/README.md b/README.md index 9e143d414..541a872f8 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ from upstream in the cluster: ```bash BUCKET= CLUSTER= -gsutil cp presto/presto.sh gs://${BUCKET}/ +gcloud storage cp presto/presto.sh gs://${BUCKET}/ gcloud dataproc clusters create ${CLUSTER} --initialization-actions gs://${BUCKET}/presto.sh ``` diff --git a/alluxio/alluxio.sh b/alluxio/alluxio.sh index 9d41820d4..336581278 100644 --- a/alluxio/alluxio.sh +++ b/alluxio/alluxio.sh @@ -33,6 +33,15 @@ readonly ALLUXIO_HOME=/opt/alluxio readonly ALLUXIO_SITE_PROPERTIES=${ALLUXIO_HOME}/conf/alluxio-site.properties readonly ALLUXIO_DOWNLOAD_URL=https://downloads.alluxio.io/downloads/files/${ALLUXIO_VERSION}/alluxio-${ALLUXIO_VERSION}-bin.tar.gz +function version_le() { [[ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]]; } +function version_lt() { [[ "$1" = "$2" ]] && return 1 || version_le "$1" "$2"; } + +GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" +GSUTIL="gcloud storage" +if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then + GSUTIL="gsutil" +fi + # Downloads a file to the local machine from a remote HTTP(S) or GCS URI into the cwd # # Args: @@ -41,7 +50,7 @@ download_file() { local -r uri="$1" if [[ "${uri}" == gs://* ]]; then - gsutil cp "${uri}" ./ + ${GSUTIL} cp "${uri}" ./ else # TODO Add metadata header tag to the wget for filtering out in download metrics. wget -nv --timeout=30 --tries=5 --retry-connrefused "${uri}" diff --git a/beam/README.md b/beam/README.md index e03de8c27..0aecdd1a7 100644 --- a/beam/README.md +++ b/beam/README.md @@ -62,7 +62,7 @@ Then, upload the jar to a Cloud Storage path that clusters can access during initialization. ```bash -gsutil cp \ +gcloud storage cp \ ./runners/flink/job-server/build/libs/beam-runners-flink_2.11-job-server-*-SNAPSHOT.jar \ /beam-runners-flink_2.11-job-server-latest-SNAPSHOT.jar ``` diff --git a/beam/beam.sh b/beam/beam.sh index 2ce1640bf..2fed3841e 100755 --- a/beam/beam.sh +++ b/beam/beam.sh @@ -22,6 +22,15 @@ readonly START_FLINK_YARN_SESSION_METADATA_KEY='flink-start-yarn-session' # Set this to true to start a flink yarn session at initialization time. readonly START_FLINK_YARN_SESSION_DEFAULT=true +function version_le() { [[ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]]; } +function version_lt() { [[ "$1" = "$2" ]] && return 1 || version_le "$1" "$2"; } + +GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" +GSUTIL="gcloud storage" +if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then + GSUTIL="gsutil" +fi + function is_master() { local role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" if [[ "$role" == 'Master' ]]; then @@ -40,7 +49,7 @@ function download_snapshot() { readonly snapshot_url="${1}" readonly protocol="$(echo "${snapshot_url}" | head -c5)" if [ "${protocol}" = "gs://" ]; then - gsutil cp "${snapshot_url}" "${LOCAL_JAR_NAME}" + ${GSUTIL} cp "${snapshot_url}" "${LOCAL_JAR_NAME}" else curl -o "${LOCAL_JAR_NAME}" "${snapshot_url}" fi diff --git a/conda/README.md b/conda/README.md index a4227dc65..984762f97 100644 --- a/conda/README.md +++ b/conda/README.md @@ -77,8 +77,8 @@ Where `create-my-cluster.sh` specifies a list of conda and/or pip packages to in ``` #!/usr/bin/env bash -gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh . -gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh . +gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh . +gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh . chmod 755 ./*conda*.sh @@ -100,9 +100,9 @@ Similarly, one can also specify a [conda environment yml file](https://github.co CONDA_ENV_YAML_GSC_LOC="gs://my-bucket/path/to/conda-environment.yml" CONDA_ENV_YAML_PATH="/root/conda-environment.yml" echo "Downloading conda environment at $CONDA_ENV_YAML_GSC_LOC to $CONDA_ENV_YAML_PATH ... " -gsutil -m cp -r $CONDA_ENV_YAML_GSC_LOC $CONDA_ENV_YAML_PATH -gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh . -gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh . +gcloud storage cp $CONDA_ENV_YAML_GSC_LOC $CONDA_ENV_YAML_PATH +gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh . +gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh . chmod 755 ./*conda*.sh diff --git a/connectors/connectors.sh b/connectors/connectors.sh index 22157dafa..84305dfd0 100755 --- a/connectors/connectors.sh +++ b/connectors/connectors.sh @@ -27,6 +27,15 @@ readonly BIGQUERY_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attribute readonly SPARK_BIGQUERY_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/spark-bigquery-connector-url || true) readonly HIVE_BIGQUERY_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/hive-bigquery-connector-url || true) +function version_le() { [[ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]]; } +function version_lt() { [[ "$1" = "$2" ]] && return 1 || version_le "$1" "$2"; } + +GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" +GSUTIL="gcloud storage" +if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then + GSUTIL="gsutil" +fi + min_version() { echo -e "$1\n$2" | sort -r -t'.' -n -k1,1 -k2,2 -k3,3 | tail -n1 } @@ -128,7 +137,7 @@ update_connector_url() { find "${vm_connectors_dir}/" -name "${pattern}" -delete - gsutil cp -P "${url}" "${vm_connectors_dir}/" + ${GSUTIL} cp -P "${url}" "${vm_connectors_dir}/" local -r jar_name=${url##*/} diff --git a/dask/README.md b/dask/README.md index 69d70738b..ec0f6909b 100644 --- a/dask/README.md +++ b/dask/README.md @@ -136,7 +136,7 @@ You can also `ssh` into the cluster and execute Dask jobs from Python files. To run jobs, you can either `scp` a file onto your cluster or use `gsutil` on the cluster to download the Python file. -`gcloud compute ssh --command="gsutil cp gs://path/to/file.py .; +`gcloud compute ssh --command="gcloud storage cp gs://path/to/file.py .; python file.py` ### Accessing Web UIs diff --git a/hbase/hbase.sh b/hbase/hbase.sh index 10724dab9..89010727c 100755 --- a/hbase/hbase.sh +++ b/hbase/hbase.sh @@ -34,6 +34,17 @@ readonly REALM=$(echo "${DOMAIN}" | awk '{print toupper($0)}') readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) readonly FQDN=$(hostname -f) +function version_le() { [[ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]]; } +function version_lt() { [[ "$1" = "$2" ]] && return 1 || version_le "$1" "$2"; } + +GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" +GSUTIL="gcloud storage" +GSUTIL_STAT="gcloud storage objects list --stat --fetch-encrypted-object-hashes" +if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then + GSUTIL="gsutil" + GSUTIL_STAT="gsutil -q stat" +fi + function retry_command() { cmd="$1" for ((i = 0; i < 10; i++)); do @@ -223,7 +234,7 @@ EOF kadmin.local -q "addprinc -randkey hbase/${m}.${DOMAIN}@${REALM}" echo "Generating hbase keytab..." kadmin.local -q "xst -k ${HBASE_HOME}/conf/hbase-${m}.keytab hbase/${m}.${DOMAIN}" - gsutil cp "${HBASE_HOME}/conf/hbase-${m}.keytab" \ + ${GSUTIL} cp "${HBASE_HOME}/conf/hbase-${m}.keytab" \ "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-${m}.keytab" done @@ -232,17 +243,17 @@ EOF kadmin.local -q "addprinc -randkey hbase/${CLUSTER_NAME}-w-${c}.${DOMAIN}" echo "Generating hbase keytab..." kadmin.local -q "xst -k ${HBASE_HOME}/conf/hbase-${CLUSTER_NAME}-w-${c}.keytab hbase/${CLUSTER_NAME}-w-${c}.${DOMAIN}" - gsutil cp "${HBASE_HOME}/conf/hbase-${CLUSTER_NAME}-w-${c}.keytab" \ + ${GSUTIL} cp "${HBASE_HOME}/conf/hbase-${CLUSTER_NAME}-w-${c}.keytab" \ "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-${CLUSTER_NAME}-w-${c}.keytab" done touch /tmp/_success - gsutil cp /tmp/_success "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success" + ${GSUTIL} cp /tmp/_success "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success" fi success=1 while [[ $success == "1" ]]; do sleep 1 success=$( - gsutil -q stat "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success" + ${GSUTIL_STAT} "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success" echo $? ) done @@ -255,7 +266,7 @@ EOF fi # Copy keytab to machine - gsutil cp "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-$(hostname -s).keytab" $hbase_keytab_path + ${GSUTIL} cp "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-$(hostname -s).keytab" $hbase_keytab_path # Change owner of keytab to hbase with read only permissions if [ -f $hbase_keytab_path ]; then diff --git a/hive-llap/llap.sh b/hive-llap/llap.sh index 5009fb92a..88b44a1ef 100644 --- a/hive-llap/llap.sh +++ b/hive-llap/llap.sh @@ -39,6 +39,15 @@ readonly INIT_ACTIONS_REPO="$(/usr/share/google/get_metadata_value attributes/in # directory files ingestied will reside readonly INIT_ACTIONS_DIR='/usr/lib/hive-llap' +function version_le() { [[ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]]; } +function version_lt() { [[ "$1" = "$2" ]] && return 1 || version_le "$1" "$2"; } + +GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" +GSUTIL="gcloud storage" +if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then + GSUTIL="gsutil" +fi + function pre_flight_checks(){ # check for bad configurations if [[ "${NUM_LLAP_NODES}" -ge "${WORKER_NODE_COUNT}" ]]; then @@ -69,7 +78,7 @@ function download_init_actions() { # Download initialization actions locally. This will download the start_llap.sh file to the cluster for execution Check if metadata is supplied echo "downalod init actions supplied as metadata..." mkdir -p "${INIT_ACTIONS_DIR}" - gsutil cp "${INIT_ACTIONS_REPO}/hive-llap/start_llap.sh" "${INIT_ACTIONS_DIR}" + ${GSUTIL} cp "${INIT_ACTIONS_REPO}/hive-llap/start_llap.sh" "${INIT_ACTIONS_DIR}" chmod 700 "${INIT_ACTIONS_DIR}/start_llap.sh" } diff --git a/mlvm/mlvm.sh b/mlvm/mlvm.sh index 320edfdc3..dcf231924 100644 --- a/mlvm/mlvm.sh +++ b/mlvm/mlvm.sh @@ -37,6 +37,17 @@ R_VERSION="$(R --version | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/ readonly R_VERSION readonly SPARK_NLP_VERSION="3.2.1" # Must include subminor version here +function version_le() { [[ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]]; } +function version_lt() { [[ "$1" = "$2" ]] && return 1 || version_le "$1" "$2"; } + +GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" +GSUTIL="gcloud storage" +GSUTIL_OPTS="" +if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then + GSUTIL="gsutil" + GSUTIL_OPTS="-m" +fi + CONDA_PACKAGES=( "r-dplyr=1.0" "r-essentials=${R_VERSION}" @@ -97,9 +108,9 @@ function download_init_actions() { # Download initialization actions locally. mkdir "${INIT_ACTIONS_DIR}"/{gpu,rapids,dask} - gsutil -m rsync -r "${INIT_ACTIONS_REPO}/rapids/" "${INIT_ACTIONS_DIR}/rapids/" - gsutil -m rsync -r "${INIT_ACTIONS_REPO}/gpu/" "${INIT_ACTIONS_DIR}/gpu/" - gsutil -m rsync -r "${INIT_ACTIONS_REPO}/dask/" "${INIT_ACTIONS_DIR}/dask/" + ${GSUTIL} ${GSUTIL_OPTS} rsync -r "${INIT_ACTIONS_REPO}/rapids/" "${INIT_ACTIONS_DIR}/rapids/" + ${GSUTIL} ${GSUTIL_OPTS} rsync -r "${INIT_ACTIONS_REPO}/gpu/" "${INIT_ACTIONS_DIR}/gpu/" + ${GSUTIL} ${GSUTIL_OPTS} rsync -r "${INIT_ACTIONS_REPO}/dask/" "${INIT_ACTIONS_DIR}/dask/" find "${INIT_ACTIONS_DIR}" -name '*.sh' -exec chmod +x {} \; } @@ -167,7 +178,7 @@ function install_spark_nlp() { function install_connectors() { local -r url="gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-${SPARK_BIGQUERY_VERSION}.jar" - gsutil cp "${url}" "${CONNECTORS_DIR}/" + ${GSUTIL} cp "${url}" "${CONNECTORS_DIR}/" local -r jar_name=${url##*/}