Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ from upstream in the cluster:
```bash
BUCKET=<your_init_actions_bucket>
CLUSTER=<cluster_name>
gsutil cp presto/presto.sh gs://${BUCKET}/
gcloud storage cp presto/presto.sh gs://${BUCKET}/
gcloud dataproc clusters create ${CLUSTER} --initialization-actions gs://${BUCKET}/presto.sh
```
Expand Down
11 changes: 10 additions & 1 deletion alluxio/alluxio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ readonly ALLUXIO_HOME=/opt/alluxio
readonly ALLUXIO_SITE_PROPERTIES=${ALLUXIO_HOME}/conf/alluxio-site.properties
readonly ALLUXIO_DOWNLOAD_URL=https://downloads.alluxio.io/downloads/files/${ALLUXIO_VERSION}/alluxio-${ALLUXIO_VERSION}-bin.tar.gz

# Returns 0 when version $1 is less than or equal to version $2.
# Ordering is delegated to `sort -V`. printf is used instead of `echo -e` so
# both arguments reach sort verbatim (no backslash-escape interpretation).
function version_le() {
  local smallest
  smallest="$(printf '%s\n' "$1" "$2" | sort -V | head -n1)"
  [[ "$1" == "${smallest}" ]]
}

# Returns 0 when version $1 is strictly less than version $2.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# Pick the Cloud Storage CLI from the installed Google Cloud SDK version:
# SDKs that compare lower than 402.0.0 use legacy `gsutil`, everything else
# uses `gcloud storage`.
GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then
  GSUTIL="gsutil"
else
  GSUTIL="gcloud storage"
fi

# Downloads a file to the local machine from a remote HTTP(S) or GCS URI into the cwd
#
# Args:
Expand All @@ -41,7 +50,7 @@ download_file() {
local -r uri="$1"

if [[ "${uri}" == gs://* ]]; then
gsutil cp "${uri}" ./
${GSUTIL} cp "${uri}" ./
else
# TODO Add metadata header tag to the wget for filtering out in download metrics.
wget -nv --timeout=30 --tries=5 --retry-connrefused "${uri}"
Expand Down
2 changes: 1 addition & 1 deletion beam/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Then, upload the jar to a Cloud Storage path that clusters can access during
initialization.

```bash
gsutil cp \
gcloud storage cp \
./runners/flink/job-server/build/libs/beam-runners-flink_2.11-job-server-*-SNAPSHOT.jar \
<BEAM_JOB_SERVICE_DESTINATION>/beam-runners-flink_2.11-job-server-latest-SNAPSHOT.jar
```
Expand Down
11 changes: 10 additions & 1 deletion beam/beam.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ readonly START_FLINK_YARN_SESSION_METADATA_KEY='flink-start-yarn-session'
# Set this to true to start a flink yarn session at initialization time.
readonly START_FLINK_YARN_SESSION_DEFAULT=true

# Returns 0 when version $1 is less than or equal to version $2.
# Ordering is delegated to `sort -V`. printf is used instead of `echo -e` so
# both arguments reach sort verbatim (no backslash-escape interpretation).
function version_le() {
  local smallest
  smallest="$(printf '%s\n' "$1" "$2" | sort -V | head -n1)"
  [[ "$1" == "${smallest}" ]]
}

# Returns 0 when version $1 is strictly less than version $2.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# Pick the Cloud Storage CLI from the installed Google Cloud SDK version:
# SDKs that compare lower than 402.0.0 use legacy `gsutil`, everything else
# uses `gcloud storage`.
GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then
  GSUTIL="gsutil"
else
  GSUTIL="gcloud storage"
fi

function is_master() {
local role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
if [[ "$role" == 'Master' ]]; then
Expand All @@ -40,7 +49,7 @@ function download_snapshot() {
readonly snapshot_url="${1}"
readonly protocol="$(echo "${snapshot_url}" | head -c5)"
if [ "${protocol}" = "gs://" ]; then
gsutil cp "${snapshot_url}" "${LOCAL_JAR_NAME}"
${GSUTIL} cp "${snapshot_url}" "${LOCAL_JAR_NAME}"
else
curl -o "${LOCAL_JAR_NAME}" "${snapshot_url}"
fi
Expand Down
10 changes: 5 additions & 5 deletions conda/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ Where `create-my-cluster.sh` specifies a list of conda and/or pip packages to in
```
#!/usr/bin/env bash

gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh .
gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh .
gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh .
gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh .

chmod 755 ./*conda*.sh

Expand All @@ -100,9 +100,9 @@ Similarly, one can also specify a [conda environment yml file](https://github.co
CONDA_ENV_YAML_GSC_LOC="gs://my-bucket/path/to/conda-environment.yml"
CONDA_ENV_YAML_PATH="/root/conda-environment.yml"
echo "Downloading conda environment at $CONDA_ENV_YAML_GSC_LOC to $CONDA_ENV_YAML_PATH ... "
gsutil -m cp -r $CONDA_ENV_YAML_GSC_LOC $CONDA_ENV_YAML_PATH
gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh .
gsutil -m cp -r gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh .
gcloud storage cp $CONDA_ENV_YAML_GSC_LOC $CONDA_ENV_YAML_PATH
gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/bootstrap-conda.sh .
gcloud storage cp gs://goog-dataproc-initialization-actions-${REGION}/conda/install-conda-env.sh .

chmod 755 ./*conda*.sh

Expand Down
11 changes: 10 additions & 1 deletion connectors/connectors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ readonly BIGQUERY_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attribute
readonly SPARK_BIGQUERY_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/spark-bigquery-connector-url || true)
readonly HIVE_BIGQUERY_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/hive-bigquery-connector-url || true)

# Returns 0 when version $1 is less than or equal to version $2.
# Ordering is delegated to `sort -V`. printf is used instead of `echo -e` so
# both arguments reach sort verbatim (no backslash-escape interpretation).
function version_le() {
  local smallest
  smallest="$(printf '%s\n' "$1" "$2" | sort -V | head -n1)"
  [[ "$1" == "${smallest}" ]]
}

# Returns 0 when version $1 is strictly less than version $2.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# Pick the Cloud Storage CLI from the installed Google Cloud SDK version:
# SDKs that compare lower than 402.0.0 use legacy `gsutil`, everything else
# uses `gcloud storage`.
GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then
  GSUTIL="gsutil"
else
  GSUTIL="gcloud storage"
fi

# Prints the smaller of the two versions given as $1 and $2.
min_version() {
  # Reverse numeric sort on the first three dot-separated fields leaves the
  # smallest version on the last line, which tail then selects.
  sort -r -t'.' -n -k1,1 -k2,2 -k3,3 < <(echo -e "$1\n$2") | tail -n1
}
Expand Down Expand Up @@ -128,7 +137,7 @@ update_connector_url() {

find "${vm_connectors_dir}/" -name "${pattern}" -delete

gsutil cp -P "${url}" "${vm_connectors_dir}/"
${GSUTIL} cp -P "${url}" "${vm_connectors_dir}/"

local -r jar_name=${url##*/}

Expand Down
2 changes: 1 addition & 1 deletion dask/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ You can also `ssh` into the cluster and execute Dask jobs from Python files. To
run jobs, you can either `scp` a file onto your cluster or use `gsutil` on the
cluster to download the Python file.

`gcloud compute ssh <cluster-name> --command="gsutil cp gs://path/to/file.py .;
`gcloud compute ssh <cluster-name> --command="gcloud storage cp gs://path/to/file.py .;
python file.py`

### Accessing Web UIs
Expand Down
21 changes: 16 additions & 5 deletions hbase/hbase.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,17 @@ readonly REALM=$(echo "${DOMAIN}" | awk '{print toupper($0)}')
readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
readonly FQDN=$(hostname -f)

# Returns 0 when version $1 is less than or equal to version $2.
# Ordering is delegated to `sort -V`. printf is used instead of `echo -e` so
# both arguments reach sort verbatim (no backslash-escape interpretation).
function version_le() {
  local smallest
  smallest="$(printf '%s\n' "$1" "$2" | sort -V | head -n1)"
  [[ "$1" == "${smallest}" ]]
}

# Returns 0 when version $1 is strictly less than version $2.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# Pick the Cloud Storage copy and object-stat commands from the installed
# Google Cloud SDK version: SDKs that compare lower than 402.0.0 use legacy
# `gsutil`, everything else uses `gcloud storage`.
GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then
  GSUTIL="gsutil"
  GSUTIL_STAT="gsutil -q stat"
else
  GSUTIL="gcloud storage"
  GSUTIL_STAT="gcloud storage objects list --stat --fetch-encrypted-object-hashes"
fi

function retry_command() {
cmd="$1"
for ((i = 0; i < 10; i++)); do
Expand Down Expand Up @@ -223,7 +234,7 @@ EOF
kadmin.local -q "addprinc -randkey hbase/${m}.${DOMAIN}@${REALM}"
echo "Generating hbase keytab..."
kadmin.local -q "xst -k ${HBASE_HOME}/conf/hbase-${m}.keytab hbase/${m}.${DOMAIN}"
gsutil cp "${HBASE_HOME}/conf/hbase-${m}.keytab" \
${GSUTIL} cp "${HBASE_HOME}/conf/hbase-${m}.keytab" \
"${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-${m}.keytab"
done

Expand All @@ -232,17 +243,17 @@ EOF
kadmin.local -q "addprinc -randkey hbase/${CLUSTER_NAME}-w-${c}.${DOMAIN}"
echo "Generating hbase keytab..."
kadmin.local -q "xst -k ${HBASE_HOME}/conf/hbase-${CLUSTER_NAME}-w-${c}.keytab hbase/${CLUSTER_NAME}-w-${c}.${DOMAIN}"
gsutil cp "${HBASE_HOME}/conf/hbase-${CLUSTER_NAME}-w-${c}.keytab" \
${GSUTIL} cp "${HBASE_HOME}/conf/hbase-${CLUSTER_NAME}-w-${c}.keytab" \
"${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-${CLUSTER_NAME}-w-${c}.keytab"
done
touch /tmp/_success
gsutil cp /tmp/_success "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success"
${GSUTIL} cp /tmp/_success "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success"
fi
success=1
while [[ $success == "1" ]]; do
sleep 1
success=$(
gsutil -q stat "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success"
${GSUTIL_STAT} "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/_success"
echo $?
)
done
Expand All @@ -255,7 +266,7 @@ EOF
fi

# Copy keytab to machine
gsutil cp "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-$(hostname -s).keytab" $hbase_keytab_path
${GSUTIL} cp "${KEYTAB_BUCKET}/keytabs/${CLUSTER_NAME}/hbase-$(hostname -s).keytab" $hbase_keytab_path

# Change owner of keytab to hbase with read only permissions
if [ -f $hbase_keytab_path ]; then
Expand Down
11 changes: 10 additions & 1 deletion hive-llap/llap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ readonly INIT_ACTIONS_REPO="$(/usr/share/google/get_metadata_value attributes/in
# directory where ingested files will reside
readonly INIT_ACTIONS_DIR='/usr/lib/hive-llap'

# Returns 0 when version $1 is less than or equal to version $2.
# Ordering is delegated to `sort -V`. printf is used instead of `echo -e` so
# both arguments reach sort verbatim (no backslash-escape interpretation).
function version_le() {
  local smallest
  smallest="$(printf '%s\n' "$1" "$2" | sort -V | head -n1)"
  [[ "$1" == "${smallest}" ]]
}

# Returns 0 when version $1 is strictly less than version $2.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# Pick the Cloud Storage CLI from the installed Google Cloud SDK version:
# SDKs that compare lower than 402.0.0 use legacy `gsutil`, everything else
# uses `gcloud storage`.
GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then
  GSUTIL="gsutil"
else
  GSUTIL="gcloud storage"
fi

function pre_flight_checks(){
# check for bad configurations
if [[ "${NUM_LLAP_NODES}" -ge "${WORKER_NODE_COUNT}" ]]; then
Expand Down Expand Up @@ -69,7 +78,7 @@ function download_init_actions() {
# Download initialization actions locally. This will download the start_llap.sh file to the cluster for execution Check if metadata is supplied
echo "downalod init actions supplied as metadata..."
mkdir -p "${INIT_ACTIONS_DIR}"
gsutil cp "${INIT_ACTIONS_REPO}/hive-llap/start_llap.sh" "${INIT_ACTIONS_DIR}"
${GSUTIL} cp "${INIT_ACTIONS_REPO}/hive-llap/start_llap.sh" "${INIT_ACTIONS_DIR}"
chmod 700 "${INIT_ACTIONS_DIR}/start_llap.sh"
}

Expand Down
19 changes: 15 additions & 4 deletions mlvm/mlvm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ R_VERSION="$(R --version | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/
readonly R_VERSION
readonly SPARK_NLP_VERSION="3.2.1" # Must include subminor version here

# Returns 0 when version $1 is less than or equal to version $2.
# Ordering is delegated to `sort -V`. printf is used instead of `echo -e` so
# both arguments reach sort verbatim (no backslash-escape interpretation).
function version_le() {
  local smallest
  smallest="$(printf '%s\n' "$1" "$2" | sort -V | head -n1)"
  [[ "$1" == "${smallest}" ]]
}

# Returns 0 when version $1 is strictly less than version $2.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# Pick the Cloud Storage CLI and its extra options from the installed Google
# Cloud SDK version: SDKs that compare lower than 402.0.0 use legacy `gsutil`
# with `-m`, everything else uses `gcloud storage` with no extra options.
GCLOUD_SDK_VERSION="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
if version_lt "${GCLOUD_SDK_VERSION}" "402.0.0"; then
  GSUTIL="gsutil"
  GSUTIL_OPTS="-m"
else
  GSUTIL="gcloud storage"
  GSUTIL_OPTS=""
fi

CONDA_PACKAGES=(
"r-dplyr=1.0"
"r-essentials=${R_VERSION}"
Expand Down Expand Up @@ -97,9 +108,9 @@ function download_init_actions() {
# Download initialization actions locally.
mkdir "${INIT_ACTIONS_DIR}"/{gpu,rapids,dask}

gsutil -m rsync -r "${INIT_ACTIONS_REPO}/rapids/" "${INIT_ACTIONS_DIR}/rapids/"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An additional options variable may be needed here.
For example, set GSUTIL_OPTS="-m" for legacy gsutil and keep it empty for the latest gcloud storage.

Then use GSUTIL_OPTS:

${GSUTIL} "${GSUTIL_OPTS}" rsync -r "${INIT_ACTIONS_REPO}/rapids/" "${INIT_ACTIONS_DIR}/rapids/"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the commands as suggested.

gsutil -m rsync -r "${INIT_ACTIONS_REPO}/gpu/" "${INIT_ACTIONS_DIR}/gpu/"
gsutil -m rsync -r "${INIT_ACTIONS_REPO}/dask/" "${INIT_ACTIONS_DIR}/dask/"
${GSUTIL} ${GSUTIL_OPTS} rsync -r "${INIT_ACTIONS_REPO}/rapids/" "${INIT_ACTIONS_DIR}/rapids/"
${GSUTIL} ${GSUTIL_OPTS} rsync -r "${INIT_ACTIONS_REPO}/gpu/" "${INIT_ACTIONS_DIR}/gpu/"
${GSUTIL} ${GSUTIL_OPTS} rsync -r "${INIT_ACTIONS_REPO}/dask/" "${INIT_ACTIONS_DIR}/dask/"

find "${INIT_ACTIONS_DIR}" -name '*.sh' -exec chmod +x {} \;
}
Expand Down Expand Up @@ -167,7 +178,7 @@ function install_spark_nlp() {
function install_connectors() {
local -r url="gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-${SPARK_BIGQUERY_VERSION}.jar"

gsutil cp "${url}" "${CONNECTORS_DIR}/"
${GSUTIL} cp "${url}" "${CONNECTORS_DIR}/"

local -r jar_name=${url##*/}

Expand Down