[minor] support multi-node training (#389)
* [train] support multi-node training

* [train] modify localhost:0 to localhost:29400
cdliang11 authored Dec 2, 2024
1 parent 4fdc23a commit d2e1bf2
Showing 12 changed files with 89 additions and 16 deletions.
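
The change swaps each recipe's single-node torchrun launch (--standalone, or a fixed master_addr/master_port) for an elastic c10d rendezvous, so one run.sh now drives one node or several. Following the usage comment added to the recipes, a two-node launch would look roughly like this (the address and node count are placeholders, not values from the commit):

    # Run the same command on every node in the job. HOST_NODE_ADDR must name
    # one node (and a free port) that all the other nodes can reach.
    bash run.sh --stage 3 --stop-stage 3 \
        --HOST_NODE_ADDR "10.0.0.1:29400" \
        --num_nodes 2

With the defaults (num_nodes=1, HOST_NODE_ADDR=localhost:29400), the scripts behave as before on a single machine.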
examples/cnceleb/v2/run.sh (10 additions, 1 deletion)

@@ -7,9 +7,16 @@
 
 . ./path.sh || exit 1
 
+# multi-node + multi-gpus:
+# bash run.sh --stage 3 --stop-stage 3 --HOST_NODE_ADDR "xxx.xxx.xxx.xxx:port" --num_nodes num_node
+
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -57,7 +64,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
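
The three rendezvous flags travel together: --rdzv_endpoint names the host:port where the c10d rendezvous store runs, --rdzv_id tags this particular job at that store, and every node must pass identical values. As a hedged sketch, the stage-3 command above expands to roughly the following for a 2-node, 4-GPU-per-node job (endpoint, config, and GPU list are illustrative, not from the commit):

    # Executed verbatim on both nodes; with a rendezvous, torchrun works out
    # each node's rank instead of needing an explicit --node_rank.
    torchrun --nnodes=2 --nproc_per_node=4 \
        --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint=10.0.0.1:29400 \
        wespeaker/bin/train.py --config conf/resnet.yaml \
            --exp_dir exp/resnet --gpus "[0,1,2,3]"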
examples/cnceleb/v3_finetune/run.sh (7 additions, 1 deletion)

@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -60,7 +64,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/sre/v2/run.sh (7 additions, 1 deletion)

@@ -9,6 +9,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 # the sre data should be prepared in kaldi format and stored in the following directory
 # only wav.scp, utt2spk and spk2utt files are needed
 sre_data_dir=sre_data_dir
@@ -65,7 +69,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/sre/v3/run.sh (7 additions, 3 deletions)

@@ -22,6 +22,10 @@
 stage=1
 stop_stage=1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -194,9 +198,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   else
     num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
   fi
-  echo "Using $num_gpus_train GPUs: $gpus"
-  #torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus_train \ # The below is to prevent problems if many jobs run on the same machine
-  torchrun --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$((RANDOM)) --nnodes=1 --nproc_per_node=$num_gpus_train \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
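
Note that the old sre/v3 line rendezvoused at $(hostname) with a random port precisely so that several jobs on one machine would not collide. Under the new scheme that role falls to --rdzv_id: concurrent jobs are told apart by their rendezvous id rather than by a throwaway port. A hedged example, assuming run.sh forwards --job_id the same way it forwards --num_nodes above:

    # Two jobs on the same machine: same endpoint, different rendezvous ids.
    bash run.sh --stage 3 --stop-stage 3 --job_id 2024
    bash run.sh --stage 3 --stop-stage 3 --job_id 2025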
examples/voxceleb/v1/Whisper-PMFA/run.sh (10 additions, 2 deletions)

@@ -8,6 +8,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="raw" # shard/raw
 model=whisper_PMFA_large_v2
@@ -57,7 +61,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training with frozen whisper parameter..."
   config=conf/whisper_PMFA_stage0.yaml
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
@@ -84,7 +90,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   config=conf/whisper_PMFA_stage1.yaml
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
   checkpoint=${exp_dir}/models/model_4.pt
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/voxceleb/v2/run.sh (10 additions, 1 deletion)

@@ -5,9 +5,16 @@
 
 . ./path.sh || exit 1
 
+# multi-node + multi-gpus:
+# bash run.sh --stage 3 --stop-stage 3 --HOST_NODE_ADDR "xxx.xxx.xxx.xxx:port" --num_nodes num_node
+
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -55,7 +62,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=29401 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/voxceleb/v2/run_wavlm.sh (7 additions, 1 deletion)

@@ -7,6 +7,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -57,7 +61,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=29401 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/voxceleb/v2_deprecated/run.sh (7 additions, 1 deletion)

@@ -8,6 +8,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 config=conf/resnet.yaml
 exp_dir=exp/ResNet34-TSTP-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-SGD-epoch150
 gpus="[0,1]"
@@ -28,7 +32,9 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train_deprecated.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/voxceleb/v3/dino/run.sh (7 additions, 1 deletion)

@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_dino.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/voxceleb/v3/moco/run.sh (7 additions, 1 deletion)

@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_contrastive.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
examples/voxceleb/v3/simclr/run.sh (7 additions, 1 deletion)

@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard" # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_contrastive.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
wespeaker/bin/train.py (3 additions, 2 deletions)

@@ -46,9 +46,10 @@ def train(config='conf/config.yaml', **kwargs):
     configs = parse_config_or_kwargs(config, **kwargs)
     checkpoint = configs.get('checkpoint', None)
     # dist configs
-    rank = int(os.environ['RANK'])
+    local_rank = int(os.environ.get('LOCAL_RANK', 0))
+    rank = int(os.environ.get('RANK', 0))
     world_size = int(os.environ['WORLD_SIZE'])
-    gpu = int(configs['gpus'][rank])
+    gpu = int(configs['gpus'][local_rank])
     torch.cuda.set_device(gpu)
     dist.init_process_group(backend='nccl')
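
This train.py fix is what actually makes multi-node launches work: RANK is global across the whole job, so on any node but the first it runs past the end of the per-node gpus list, while LOCAL_RANK restarts at 0 on every node and is therefore the right index into it. Illustrative values for local process 0 on the second of two 2-GPU nodes:

    # Environment torchrun exports to that worker (values illustrative).
    RANK=2          # global rank = node_rank * nproc_per_node + LOCAL_RANK
    LOCAL_RANK=0    # rank within this node; indexes this node's gpus list
    WORLD_SIZE=4    # nnodes * nproc_per_node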
