diff --git a/examples/libE_submission_scripts/bebop_submit_pbs_central.sh b/examples/libE_submission_scripts/bebop_submit_pbs_central.sh
new file mode 100644
index 0000000000..4d0aff5894
--- /dev/null
+++ b/examples/libE_submission_scripts/bebop_submit_pbs_central.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+#PBS -l select=4
+#PBS -l walltime=00:15:00
+#PBS -q bdwall
+#PBS -A [project]
+#PBS -N libE_example
+
+
+cd $PBS_O_WORKDIR
+# Choose the MPI backend. Note that the mpi4py build in your environment should match it.
+module load oneapi/mpi
+# module load openmpi
+
+python run_libe_example.py -n 16
diff --git a/examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh b/examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh
deleted file mode 100644
index 756471d087..0000000000
--- a/examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash -l
-#PBS -l select=2:mpiprocs=16
-#PBS -l walltime=00:15:00
-#PBS -q bdwall
-#PBS -A [project]
-#PBS -N libE_example
-
-
-cd $PBS_O_WORKDIR
-module load openmpi
-
-mpirun -n 16 --ppn 8 python run_libe_example.py
diff --git a/examples/libE_submission_scripts/bebop_submit_slurm_central.sh b/examples/libE_submission_scripts/bebop_submit_slurm_central.sh
deleted file mode 100644
index f599a05099..0000000000
--- a/examples/libE_submission_scripts/bebop_submit_slurm_central.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-#SBATCH -J libE_test_central
-#SBATCH -N 5
-#SBATCH -p knlall
-#SBATCH -A [project]
-#SBATCH -o tlib.%j.%N.out
-#SBATCH -e tlib.%j.%N.error
-#SBATCH -t 01:00:00
-
-# Launch script for running in central mode with mpi4py.
-# libEnsemble will run on a dedicated node (or nodes).
-# The remaining nodes in the allocation will be dedicated to worker launched apps.
-# Use executor with auto-resources=True and central_mode=True.
-
-# User to edit these variables
-export EXE=libE_calling_script.py
-export NUM_WORKERS=4
-export I_MPI_FABRICS=shm:tmi
-
-# Overcommit will allow ntasks up to the no. of contexts on one node (eg. 320 on Bebop)
-srun --overcommit --ntasks=$(($NUM_WORKERS+1)) --nodes=1 python $EXE
-
-# To use local mode instead of mpi4py (with parse_args())
-# python calling_script.py -n $NUM_WORKERS
diff --git a/examples/libE_submission_scripts/submit_distrib_mpi4py.sh b/examples/libE_submission_scripts/submit_distrib_mpi4py.sh
new file mode 100644
index 0000000000..d696c1dfa6
--- /dev/null
+++ b/examples/libE_submission_scripts/submit_distrib_mpi4py.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#SBATCH -J libE_test
+#SBATCH -N 4
+#SBATCH -p [queue]
+#SBATCH -A [project]
+#SBATCH -o tlib.%j.%N.out
+#SBATCH -e tlib.%j.%N.error
+#SBATCH -t 01:00:00
+
+# Launch script that runs libEnsemble in distributed mode with mpi4py.
+# Workers are spread evenly over the nodes, and the manager is added to the first node.
+# Requires an even distribution: either multiple workers per node or multiple nodes per worker.
+# Optionally, the manager can be given a dedicated node.
+# Using the MPI Executor ensures launched tasks are co-located with their workers.
+# If the node_list file is kept, it informs libEnsemble of the available resources;
+# otherwise, libEnsemble auto-detects them.
+
+# User to edit these variables
+export EXE=libE_calling_script.py
+export NUM_WORKERS=4
+export MANAGER_NODE=false # true = Manager has a dedicated node (assign one extra node)
+export USE_NODE_LIST=true # If false, allow libE to determine node_list from the environment.
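+
+# Illustrative example of the resulting distribution (using the default values
+# above): with the 4 nodes requested, NUM_WORKERS=4 and MANAGER_NODE=false, the
+# arithmetic below gives WORKER_NODES=4 and WORKERS_PER_NODE=1, so the generated
+# machinefile lists the first node twice (manager plus one worker) and each
+# remaining node once.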
+
+# As libE shares nodes with user applications, allow fallback if contexts overrun.
+unset I_MPI_FABRICS
+export I_MPI_FABRICS_LIST=tmi,tcp
+export I_MPI_FALLBACK=1
+
+# If used in the calling script: after this many minutes the manager kills workers and exits cleanly.
+export LIBE_WALLCLOCK=55
+
+#-----------------------------------------------------------------------------
+# Work out distribution
+if [[ $MANAGER_NODE = "true" ]]; then
+  WORKER_NODES=$(($SLURM_NNODES-1))
+else
+  WORKER_NODES=$SLURM_NNODES
+fi
+
+if [[ $NUM_WORKERS -ge $WORKER_NODES ]]; then
+  SUB_NODE_WORKERS=true
+  WORKERS_PER_NODE=$(($NUM_WORKERS/$WORKER_NODES))
+else
+  SUB_NODE_WORKERS=false
+  NODES_PER_WORKER=$(($WORKER_NODES/$NUM_WORKERS))
+fi
+#-----------------------------------------------------------------------------
+
+# A little useful information
+echo -e "Manager process running on: $HOSTNAME"
+echo -e "Directory is: $PWD"
+
+# Generate a node list with one node per line:
+srun hostname | sort -u > node_list
+
+# Add the manager node to the machinefile
+head -n 1 node_list > machinefile.$SLURM_JOBID
+
+# Add the worker nodes to the machinefile
+if [[ $SUB_NODE_WORKERS = "true" ]]; then
+  awk -v repeat=$WORKERS_PER_NODE '{for(i=0;i<repeat;i++)print}' node_list \
+    >> machinefile.$SLURM_JOBID
+else
+  awk -v patt="$NODES_PER_WORKER" 'NR % patt == 1' node_list \
+    >> machinefile.$SLURM_JOBID
+fi
+
+if [[ $USE_NODE_LIST = "false" ]]; then
+  rm node_list
+  wait
+fi
+
+# Put in a timestamp
+echo Starting execution at: `date`
+
+# To use srun
+export SLURM_HOSTFILE=machinefile.$SLURM_JOBID
+
+# The "arbitrary" distribution flag should ensure SLURM_HOSTFILE is picked up.
+# Pass LIBE_WALLCLOCK only if the calling script expects it as an argument.
+# cmd="srun --ntasks $(($NUM_WORKERS+1)) -m arbitrary python $EXE"
+cmd="srun --ntasks $(($NUM_WORKERS+1)) -m arbitrary python $EXE $LIBE_WALLCLOCK"
+
+echo The command is: $cmd
+echo End of Slurm script information.
+echo -e "All further output is from the process being run and not the script.\n"
+
+$cmd
+
+# Print the date again -- when finished
+echo Finished at: `date`
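+
+# Alternative (sketch): to use libEnsemble's local (multiprocessing) comms
+# instead of mpi4py, the machinefile/srun setup above is not needed. With a
+# calling script that uses parse_args(), a single command such as the
+# following would suffice (as in the PBS example in this directory):
+# python $EXE -n $NUM_WORKERS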