Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions examples/libE_submission_scripts/bebop_submit_pbs_central.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash -l
#PBS -l select=4
#PBS -l walltime=00:15:00
#PBS -q bdwall
#PBS -A [project]
#PBS -N libE_example

# Submission script: runs a libEnsemble example in central mode on 4 nodes.
# Replace [project] above with your allocation before submitting.

# Run from the directory the job was submitted from; abort if it is unreachable.
cd "$PBS_O_WORKDIR" || exit 1

# Choose MPI backend. Note that the built mpi4py in your environment should match.
module load oneapi/mpi
# module load openmpi

python run_libe_example.py -n 16
12 changes: 0 additions & 12 deletions examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh

This file was deleted.

24 changes: 0 additions & 24 deletions examples/libE_submission_scripts/bebop_submit_slurm_central.sh

This file was deleted.

89 changes: 89 additions & 0 deletions examples/libE_submission_scripts/submit_distrib_mpi4py.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/bin/bash
#SBATCH -J libE_test
#SBATCH -N 4
#SBATCH -p [queue]
#SBATCH -A <my_project>
#SBATCH -o tlib.%j.%N.out
#SBATCH -e tlib.%j.%N.error
#SBATCH -t 01:00:00

# Launch script that runs libEnsemble in distributed mode with mpi4py.
# Workers are spread evenly over nodes and the manager is added to the first node.
# Requires an even distribution: either multiple workers per node or multiple
# nodes per worker. Optionally the manager can have a dedicated node.
# Using the MPI Executor ensures tasks are co-located with their workers.
# If the node_list file is kept, it informs libEnsemble of available resources;
# otherwise libEnsemble auto-detects them.

# User to edit these variables
export EXE=libE_calling_script.py
export NUM_WORKERS=4
export MANAGER_NODE=false  # true = manager has a dedicated node (assign one extra)
export USE_NODE_LIST=true  # false = let libEnsemble determine node_list from environment

# As libEnsemble shares nodes with user applications, allow a fabric fallback
# in case communication contexts overrun.
unset I_MPI_FABRICS
export I_MPI_FABRICS_LIST=tmi,tcp
export I_MPI_FALLBACK=1

# If used in the calling script: after N minutes the manager kills workers
# and exits cleanly.
export LIBE_WALLCLOCK=55

#-----------------------------------------------------------------------------
# Work out the worker-to-node distribution.
if [[ "$MANAGER_NODE" = "true" ]]; then
  WORKER_NODES=$((SLURM_NNODES - 1))
else
  WORKER_NODES=$SLURM_NNODES
fi

if [[ "$NUM_WORKERS" -ge "$WORKER_NODES" ]]; then
  SUB_NODE_WORKERS=true
  WORKERS_PER_NODE=$((NUM_WORKERS / WORKER_NODES))
else
  SUB_NODE_WORKERS=false
  NODES_PER_WORKER=$((WORKER_NODES / NUM_WORKERS))
fi
#-----------------------------------------------------------------------------

# A little useful information
echo -e "Manager process running on: $HOSTNAME"
echo -e "Directory is: $PWD"

# Generate a node list with 1 node per line:
srun hostname | sort -u > node_list

# Add manager node to machinefile
head -n 1 node_list > "machinefile.$SLURM_JOBID"

# Add worker nodes to machinefile: repeat each node for sub-node workers,
# or take every Nth node when each worker spans multiple nodes.
if [[ "$SUB_NODE_WORKERS" = "true" ]]; then
  awk -v repeat="$WORKERS_PER_NODE" '{for (i = 0; i < repeat; i++) print}' \
    node_list >> "machinefile.$SLURM_JOBID"
else
  awk -v patt="$NODES_PER_WORKER" 'NR % patt == 1' node_list \
    >> "machinefile.$SLURM_JOBID"
fi

if [[ "$USE_NODE_LIST" = "false" ]]; then
  rm node_list
fi

# Put in a timestamp
echo "Starting execution at: $(date)"

# srun reads the machinefile through this variable.
export SLURM_HOSTFILE=machinefile.$SLURM_JOBID

# The "arbitrary" distribution flag should ensure SLURM_HOSTFILE is picked up.
# cmd="srun --ntasks $((NUM_WORKERS + 1)) -m arbitrary python $EXE"
cmd="srun --ntasks $((NUM_WORKERS + 1)) -m arbitrary python $EXE $LIBE_WALLCLOCK"

echo "The command is: $cmd"
echo "End Slurm script information."
# printf (not echo) so the \n\n separator is actually interpreted.
printf 'All further output is from the process being run and not the script.\n\n %s\n' "$cmd"

$cmd

# Print the date again -- when finished
echo "Finished at: $(date)"
Loading