forked from MarlonBecker/DeepLearningBaseTraining
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubmit
More file actions
48 lines (40 loc) · 1.85 KB
/
submit
File metadata and controls
48 lines (40 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/bash
# Slurm batch script: reserves one GPU node and launches main.py through srun.
# Extra command-line arguments given to this script are forwarded to main.py.
# NOTE: sbatch only reads #SBATCH directives that appear before the first
# executable command, so keep this header block at the top of the script.
#SBATCH --job-name=name
#SBATCH --export=NONE # Start with a clean environment
#SBATCH --nodes=1 # the number of nodes you want to reserve
#SBATCH --gres=gpu:4 # generic resources: 4 GPUs per node
#SBATCH --ntasks-per-node=4 # tasks per node (same count as the GPUs requested above)
#SBATCH --cpus-per-task=8 # CPU cores reserved for each task
#SBATCH --mem=220G # how much memory is needed per node (units can be: K, M, G, T)
#SBATCH --partition=gpua100 # on which partition to submit the job
#SBATCH --time=7-00:00:00 # the max wallclock time (format days-hours:minutes:seconds, here 7 days)
#SBATCH --output=output.dat # the file where output is written to (stdout & stderr)
#SBATCH --mail-type=FAIL # receive an email only when the job fails
#SBATCH --mail-user=m_beck52@uni-muenster.de # your mail address
#SBATCH --nice=100 # positive nice value: lowers this job's scheduling priority
# ---- Job diagnostics -------------------------------------------------------
# Everything below is printed to the job's output so that a failed run can be
# analysed afterwards without access to the (already released) allocation.

# Every environment entry that mentions SLURM (job id, node list, task layout, ...).
echo "<<<< SLURM VARIABLES >>>>>"
printenv | grep SLURM

# Detailed scheduler record of this job. The job id is quoted so that it is
# always passed as exactly one argument.
echo "<<<< 'scontrol show jobid -dd' OUTPUT >>>>>"
scontrol show jobid -dd "$SLURM_JOBID"

# One line per GPU that nvidia-smi can see.
echo "<<<< GPU INFO >>>>>"
nvidia-smi -L
if ! command -v module &> /dev/null
then
echo "module not found! using dirty workaround instead"
echo source /opt/lmod/lmod/init/sh
export MODULEPATH=/Applic.HPC/modules/PALMA:/Applic.ZIV/modulefiles/
source /opt/lmod/lmod/init/sh
echo source /etc/profile
source /etc/profile
export HOME=/home/m/m_beck52
fi
# START THE APPLICATION
# Load the toolchain first, so that the python3 used for the port lookup below
# is the same interpreter that srun launches (the job starts with a clean
# environment, so no interpreter is guaranteed before the modules are loaded).
module purge
module load palma/2022a GCCcore/11.3.0
module load Python/3.10.4

# MASTER_ADDR / MASTER_PORT / WORLD_SIZE are exported for the launched tasks
# (presumably read by main.py for its distributed setup - confirm there).

# The first hostname is the master address
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr

# Let the OS pick a currently unused TCP port: bind to port 0 and read the
# assigned number back. Assignment and export are separate statements so the
# exit status of the lookup is not masked by `export`.
MASTER_PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
if [[ -z "$MASTER_PORT" ]]; then
  echo "WARNING: could not determine a free port, MASTER_PORT is empty" >&2
fi
export MASTER_PORT

# Total number of tasks: nodes * tasks per node.
export WORLD_SIZE=$((SLURM_JOB_NUM_NODES * SLURM_NTASKS_PER_NODE))

# --dataThreads gets one thread fewer than the CPUs reserved per task.
# "$@" forwards the arguments of this script to main.py unchanged; quoting
# keeps arguments that contain spaces or glob characters intact.
srun --export=ALL python3 main.py --rndSeed --dataDir /scratch/tmp/m_beck52/data \
  --dataThreads "$((SLURM_CPUS_PER_TASK - 1))" "$@"