#!/bin/bash
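# submit_inf: self-resubmitting ("infinite") SLURM job script; each job checks
# that its predecessor timed out, queues a successor, and continues training.
# (Forked from MarlonBecker/DeepLearningBaseTraining.)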
#SBATCH --job-name=name___RUN__ # the name of your job (__RUN__ is a template placeholder)
#SBATCH --export=NONE # Start with a clean environment
#SBATCH --nodes=2 # the number of nodes you want to reserve
#SBATCH --gres=gpu:4 # number of GPUs per node
#SBATCH --ntasks-per-node=4 # one task per GPU
#SBATCH --cpus-per-task=6 # CPU cores per task (e.g. for data loading workers)
#SBATCH --mem=100G # how much memory is needed per node (units can be: K, M, G, T)
#SBATCH --partition=gpu2080 # on which partition to submit the job
#SBATCH --time=48:00:00 # the maximum wall-clock time your job may run
#SBATCH --mail-type=FAIL # receive an email only if your job fails
#SBATCH --mail-user=m_beck52@uni-muenster.de # your mail address
#SBATCH --nice=80 # lower the job's scheduling priority
# --> inf job <---
# Extract the job ID of the predecessor from the dependency spec
# (e.g. "afternotok:12345"); this job only makes sense inside a chain.
LAST_JOB_ID=$(echo "$SLURM_JOB_DEPENDENCY" | cut -f2- -d:)
# Continue only if the previous job hit its wall-clock limit; any other
# final state (e.g. FAILED, CANCELLED) breaks the chain.
if sacct -j "$LAST_JOB_ID" -o state | grep -q TIMEOUT
then
    echo "found timeout in last job $LAST_JOB_ID -> continue"
else
    echo "error or no timeout in job $LAST_JOB_ID -> exit"
    exit 1
fi
# Queue the next link of the chain right away; via afternotok it is only
# released if this job itself fails or times out.
NEXT_JOB_ID=$(sbatch --dependency=afternotok:$SLURM_JOBID submit_inf)
NEXT_JOB_ID=${NEXT_JOB_ID##* } # sbatch prints "Submitted batch job <id>"; keep the last word
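# Note (an assumption about the intended workflow, not stated in this file):
# the first job of a chain cannot be this script submitted on its own, since
# SLURM_JOB_DEPENDENCY is unset then and the sacct check above exits with an
# error; the chain is presumably started by a separate initial submit script.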
# --> torch distributed setup <---
export GPUS_PER_NODE=4 # must match --gres=gpu:4 and --ntasks-per-node above
# The first hostname is the master address
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
export MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
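# The probed port is free when the probe closes it, but could in principle be
# claimed by another process before torch binds it; a small race that this
# common pattern accepts.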
export WORLD_SIZE=$((SLURM_JOB_NUM_NODES*GPUS_PER_NODE))
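# Hedged sketch (not part of this repo; main.py's actual setup may differ):
# with these variables set, each srun task would typically initialize
# torch.distributed roughly like this:
#   import os
#   import torch
#   import torch.distributed as dist
#   rank = int(os.environ["SLURM_PROCID"])                   # global rank, 0..WORLD_SIZE-1
#   torch.cuda.set_device(int(os.environ["SLURM_LOCALID"]))  # one GPU per task
#   dist.init_process_group(backend="nccl", rank=rank,
#                           world_size=int(os.environ["WORLD_SIZE"]))
#   # the default init_method="env://" reads MASTER_ADDR and MASTER_PORT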
# START THE APPLICATION
module purge
module load palma/2022a GCCcore/11.3.0
module load Python/3.10.4
# --> run <---
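# srun launches nodes * ntasks-per-node = 8 tasks in total (one per GPU),
# matching WORLD_SIZE; each task reads its global rank from SLURM_PROCID.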
srun --export=ALL python3 main.py --logSubDir __DIR__ --contin