ddp_apex.sb
#!/bin/bash
#SBATCH --job-name="ddl_imagenet"
#SBATCH --output="ddl_imagenet.%j.%N.out"
#SBATCH --error="ddl_imagenet.%j.%N.err"
#SBATCH --partition=gpu
#SBATCH --time=24:00:00
#SBATCH --nodes=2
#SBATCH --cpus-per-task=40
#SBATCH --tasks-per-node=4
#SBATCH --mem-per-cpu=1200
#SBATCH --gres=gpu:v100:4
#SBATCH --export=ALL
##SBATCH --reservation=<reservation_name>
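# Job geometry: 2 nodes x 4 tasks x 1 GPU each = 8 V100s in total; the
# loop below starts one torch.distributed.launch per node, and each
# launcher forks 4 training processes, one per GPU on that node.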
# Resolve the master node's short hostname (e.g. "hal01") and IP address
MASTER=$(/bin/hostname -s)
MASTER_IP=$(/bin/hostname -i)
# All node names other than the master node
# (-x matches whole lines, so e.g. "hal1" does not also filter "hal10")
SLAVES=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | grep -v -x "$MASTER")
# Make sure this node (MASTER) comes first
HOSTLIST="$MASTER $SLAVES"
module load wmlce/1.6.2-py3.7
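# (wmlce is IBM's Watson ML Community Edition; this module is assumed
# to provide the PyTorch and apex builds used below)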
cd /home/kexu6/src/distributed-pytorch
# Launch the PyTorch launcher processes, first on the master
# (first in $HOSTLIST), then on the slaves
RANK=0
for node in $HOSTLIST; do
    # -w pins each launcher to its own node; without it srun may place
    # both launchers on the same node, and $node would go unused
    srun -N 1 -n 1 -w "$node" python -m torch.distributed.launch \
        --nproc_per_node=4 \
        --nnodes=$SLURM_JOB_NUM_NODES \
        --node_rank=$RANK \
        --master_addr=$MASTER_IP --master_port=8888 \
        imagenet_ddp_apex.py -a resnet50 -b 208 --workers 20 \
        --opt-level O2 /home/shared/imagenet/raw/ &
    RANK=$((RANK+1))
done
wait
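
Submit the job with "sbatch ddp_apex.sb"; per the directives above, stdout
and stderr land in ddl_imagenet.<jobid>.<node>.out and .err files.

For reference, below is a minimal sketch of the setup side of a training
script driven this way. The real imagenet_ddp_apex.py is not shown on this
page, so the --local_rank argument handling, the env:// rendezvous, and the
apex amp/DDP calls are assumptions inferred from the launch flags above,
not the actual file contents.

import argparse
import torch
import torch.distributed as dist
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
# torch.distributed.launch passes --local_rank to each worker it forks
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()

# Bind this worker to its own GPU before doing any CUDA work
torch.cuda.set_device(args.local_rank)

# MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are set by the launcher
# from --master_addr, --master_port, --node_rank and --nnodes above
dist.init_process_group(backend='nccl', init_method='env://')

model = torch.nn.Linear(512, 10).cuda()  # stand-in for resnet50
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Mixed precision at the O2 level requested on the command line
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
# apex's DDP wrapper all-reduces gradients across all 8 GPUs
model = DDP(model)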