#!/bin/bash
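# submit_inf: self-resubmitting ("infinite") SLURM job script; each job checks
# that its predecessor timed out, queues a successor, and continues training.
# (Forked from MarlonBecker/DeepLearningBaseTraining.)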
#SBATCH --job-name=name___RUN__ # the name of your job (__RUN__ is a template placeholder)
#SBATCH --export=NONE # Start with a clean environment
#SBATCH --nodes=2 # the number of nodes you want to reserve
#SBATCH --gres=gpu:4 # number of GPUs per node
#SBATCH --ntasks-per-node=4 # one task per GPU
#SBATCH --cpus-per-task=6 # CPU cores per task (e.g. for data loading workers)
#SBATCH --mem=100G # how much memory is needed per node (units can be: K, M, G, T)
#SBATCH --partition=gpu2080 # on which partition to submit the job
#SBATCH --time=48:00:00 # the maximum wall-clock time your job may run
#SBATCH --mail-type=FAIL # receive an email only if your job fails
#SBATCH --mail-user=m_beck52@uni-muenster.de # your mail address
#SBATCH --nice=80 # lower the job's scheduling priority
# --> inf job <---
# Extract the job ID of the predecessor from the dependency spec
# (e.g. "afternotok:12345"); this job only makes sense inside a chain.
LAST_JOB_ID=$(echo "$SLURM_JOB_DEPENDENCY" | cut -f2- -d:)
# Continue only if the previous job hit its wall-clock limit; any other
# final state (e.g. FAILED, CANCELLED) breaks the chain.
if sacct -j "$LAST_JOB_ID" -o state | grep -q TIMEOUT
then
    echo "found timeout in last job $LAST_JOB_ID -> continue"
else
    echo "error or no timeout in job $LAST_JOB_ID -> exit"
    exit 1
fi
# Queue the next link of the chain right away; via afternotok it is only
# released if this job itself fails or times out.
NEXT_JOB_ID=$(sbatch --dependency=afternotok:$SLURM_JOBID submit_inf)
NEXT_JOB_ID=${NEXT_JOB_ID##* } # sbatch prints "Submitted batch job <id>"; keep the last word
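# Note (an assumption about the intended workflow, not stated in this file):
# the first job of a chain cannot be this script submitted on its own, since
# SLURM_JOB_DEPENDENCY is unset then and the sacct check above exits with an
# error; the chain is presumably started by a separate initial submit script.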
# --> torch distributed setup <---
export GPUS_PER_NODE=4 # must match --gres=gpu:4 and --ntasks-per-node above
# The first hostname is the master address
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
export MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
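# The probed port is free when the probe closes it, but could in principle be
# claimed by another process before torch binds it; a small race that this
# common pattern accepts.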
export WORLD_SIZE=$((SLURM_JOB_NUM_NODES*GPUS_PER_NODE))
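# Hedged sketch (not part of this repo; main.py's actual setup may differ):
# with these variables set, each srun task would typically initialize
# torch.distributed roughly like this:
#   import os
#   import torch
#   import torch.distributed as dist
#   rank = int(os.environ["SLURM_PROCID"])                   # global rank, 0..WORLD_SIZE-1
#   torch.cuda.set_device(int(os.environ["SLURM_LOCALID"]))  # one GPU per task
#   dist.init_process_group(backend="nccl", rank=rank,
#                           world_size=int(os.environ["WORLD_SIZE"]))
#   # the default init_method="env://" reads MASTER_ADDR and MASTER_PORT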
# START THE APPLICATION
module purge
module load palma/2022a GCCcore/11.3.0
module load Python/3.10.4
# --> run <---
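# srun launches nodes * ntasks-per-node = 8 tasks in total (one per GPU),
# matching WORLD_SIZE; each task reads its global rank from SLURM_PROCID.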
srun --export=ALL python3 main.py --logSubDir __DIR__ --contin