-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy path03-multi-node-basic-nodelist.json
More file actions
41 lines (41 loc) · 1.23 KB
/
03-multi-node-basic-nodelist.json
File metadata and controls
41 lines (41 loc) · 1.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
{
"_comment": "Multi-Node (2 nodes) on specific nodes - SLURM with nodelist",
"_description": "Same as 03-multi-node-basic, but the job runs only on the listed nodes (node01, node02). Node health preflight is skipped when nodelist is set.",
"_use_case": "Pin a multi-node workload to specific nodes; replace node01,node02 with the node names for your cluster.",
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"slurm": {
"partition": "amd-rccl",
"nodes": 2,
"nodelist": "node01,node02",
"gpus_per_node": 8,
"time": "24:00:00",
"output_dir": "./slurm_results",
"exclusive": true,
"network_interface": "eth0"
},
"distributed": {
"launcher": "torchrun",
"backend": "nccl",
"port": 29500,
"nnodes": 2,
"nproc_per_node": 8
},
"env_vars": {
"NCCL_DEBUG": "WARN",
"NCCL_DEBUG_SUBSYS": "INIT,NET",
"NCCL_IB_DISABLE": "1",
"NCCL_SOCKET_IFNAME": "eth0",
"TORCH_NCCL_HIGH_PRIORITY": "1",
"GPU_MAX_HW_QUEUES": "2",
"TORCH_NCCL_ASYNC_ERROR_HANDLING": "1",
"NCCL_TIMEOUT": "600",
"HSA_ENABLE_SDMA": "0",
"OMP_NUM_THREADS": "8",
"MIOPEN_FIND_MODE": "1",
"MIOPEN_USER_DB_PATH": "/tmp/.miopen",
"HSA_FORCE_FINE_GRAIN_PCIE": "1",
"RCCL_ENABLE_HIPGRAPH": "0"
},
"debug": false
}