-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathpt_multinode_helper_funcs.sh
More file actions
executable file
·98 lines (80 loc) · 2.34 KB
/
pt_multinode_helper_funcs.sh
File metadata and controls
executable file
·98 lines (80 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#
# Helper functions for running multi-node trainings using PyTorch.
#
# (c) 2021, Brian J. Stucky
# UF Research Computing
#
# Global environment constants recommended by Nvidia.
#
# EXCLUDE_IB_LIST is a plain (non-exported) shell variable holding a
# comma-separated device list; it is only used to build NCCL_IB_HCA below.
# Per NCCL's documented NCCL_IB_HCA syntax, the leading '^' marks the list as
# devices to exclude.  Only the three NCCL_* variables are exported.
EXCLUDE_IB_LIST='mlx5_4,mlx5_5,mlx5_10,mlx5_11'
NCCL_DEBUG='WARN'
NCCL_IB_HCA="^${EXCLUDE_IB_LIST}"
NCCL_SOCKET_IFNAME='bridge-1145'
export NCCL_DEBUG NCCL_IB_HCA NCCL_SOCKET_IFNAME
#
# Returns a random, unused TCP port number.
#
# Candidates are drawn uniformly from MIN_PORT..MAX_PORT (inclusive) and
# checked against the local ports that `netstat -a -n -t` currently reports.
#
# Outputs: the chosen port number on stdout.
# Returns: 0 on success; 1 if a candidate port could not be generated.
#
get_unused_port() {
    # Well-known ports end at 1023. On Linux, dynamic ports start at 32768
    # (see /proc/sys/net/ipv4/ip_local_port_range).
    local MIN_PORT=1024
    local MAX_PORT=32767

    # One local port number per line.  The first two netstat lines are
    # headers; field 4 is the local "address:port" column.  Declared
    # separately from the assignment so `local` does not mask the pipeline's
    # exit status.
    local USED_PORTS
    USED_PORTS=$(netstat -a -n -t | tail -n +3 | tr -s ' ' | \
        cut -d ' ' -f 4 | sed 's/.*:\([0-9]\+\)$/\1/' | sort -n | uniq)

    # Generate random port numbers within the search range (inclusive) until
    # we find one that isn't in use.
    local RAN_PORT
    while
        # Bail out instead of spinning forever if no candidate can be drawn.
        RAN_PORT=$(shuf -i "${MIN_PORT}-${MAX_PORT}" -n 1) || return 1
        [[ -n "$RAN_PORT" ]] || return 1
        # Exact whole-line match: wrapping both sides in newlines prevents a
        # candidate such as 5000 from "matching" an in-use port like 15000.
        [[ $'\n'"$USED_PORTS"$'\n' == *$'\n'"$RAN_PORT"$'\n'* ]]
    do
        continue
    done

    echo "$RAN_PORT"
}
#
# Initializes information about the nodes for the current run.
#
# Globals read:    SLURM_JOB_NODELIST
# Globals written: PRIMARY       (exported) short hostname of the current host
#                  PRIMARY_PORT  (exported) port chosen by get_unused_port
#                  SECONDARIES   every other job host, one per line
#                  ALL_NODES     "$PRIMARY $SECONDARIES"
#
init_node_info() {
    # Assign and export separately so the export does not hide a failure of
    # the command substitution.
    PRIMARY=$(hostname -s)
    export PRIMARY

    # -x -F removes only the line that is exactly the primary's name; a plain
    # substring match would also drop e.g. node10 when the primary is node1.
    # The node list is quoted because values like "node[1-4]" are otherwise
    # subject to pathname expansion.
    SECONDARIES=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | \
        grep -v -x -F -- "$PRIMARY")
    ALL_NODES="$PRIMARY $SECONDARIES"

    PRIMARY_PORT=$(get_unused_port)
    export PRIMARY_PORT
}
#
# Runs a command on a host node and will attempt to re-run the command upon
# failure. The command to run must be passed as an argument; the maximum number
# of tries and the wait time (in seconds) before a retry can also be passed as
# arguments. The default MAX_TRIES is 3 and the default WAIT_TIME is 4.
#
# Usage: run_with_retry "$COMMAND" [$MAX_TRIES [$WAIT_TIME]]
#
# Outputs: a progress message on stdout before each retry and after the final
#          failure (in addition to whatever the command itself prints).
# Returns: the exit status of the last attempt (0 once the command succeeds).
#
run_with_retry() {
    # Get function arguments, using defaults as needed.
    local COMMAND=$1
    local MAX_TRIES=${2:-3}
    local WAIT_TIME=${3:-4}

    local TRY_N=0
    local RETVAL=1

    # Run the command, retrying as needed until MAX_TRIES is reached.  The
    # command always runs at least once because the attempt is part of the
    # loop condition.
    while
        TRY_N=$((TRY_N + 1))
        # COMMAND is a single string holding the command and its arguments;
        # it is deliberately left unquoted so that it is split into words.
        # shellcheck disable=SC2086
        $COMMAND
        RETVAL=$?
        [ "$MAX_TRIES" -gt "$TRY_N" ] && [ "$RETVAL" -ne 0 ]
    do
        # `hostname -s` (rather than the deprecated `hostname -a`) so the
        # message always names the host.
        echo "Run failed on $(hostname -s); retrying in $WAIT_TIME second(s)..."
        sleep "$WAIT_TIME"
    done

    if [ "$RETVAL" -ne 0 ]; then
        echo "Run failure on $(hostname -s); maximum tries exceeded."
    fi

    # Propagate the outcome so callers can detect a run that never succeeded.
    return "$RETVAL"
}