From d2e1bf2b63b9df141d9397d6544c2d74e7e2dbb0 Mon Sep 17 00:00:00 2001
From: Chengdong Liang <1404056823@qq.com>
Date: Mon, 2 Dec 2024 14:26:40 +0800
Subject: [PATCH] [minor] support multi-node training (#389)

* [train] support multi-node training

* [train] modify localhost:0 to localhost:29400
---
 examples/cnceleb/v2/run.sh               | 11 ++++++++++-
 examples/cnceleb/v3_finetune/run.sh      |  8 +++++++-
 examples/sre/v2/run.sh                   |  8 +++++++-
 examples/sre/v3/run.sh                   | 10 +++++++---
 examples/voxceleb/v1/Whisper-PMFA/run.sh | 12 ++++++++++--
 examples/voxceleb/v2/run.sh              | 11 ++++++++++-
 examples/voxceleb/v2/run_wavlm.sh        |  8 +++++++-
 examples/voxceleb/v2_deprecated/run.sh   |  8 +++++++-
 examples/voxceleb/v3/dino/run.sh         |  8 +++++++-
 examples/voxceleb/v3/moco/run.sh         |  8 +++++++-
 examples/voxceleb/v3/simclr/run.sh       |  8 +++++++-
 wespeaker/bin/train.py                   |  5 +++--
 12 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/examples/cnceleb/v2/run.sh b/examples/cnceleb/v2/run.sh
index d813c60c..f0f7f216 100755
--- a/examples/cnceleb/v2/run.sh
+++ b/examples/cnceleb/v2/run.sh
@@ -7,9 +7,16 @@
 
 . ./path.sh || exit 1
 
+# multi-node + multi-gpus:
+# bash run.sh --stage 3 --stop-stage 3 --HOST_NODE_ADDR "xxx.xxx.xxx.xxx:port" --num_nodes num_node
+
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -57,7 +64,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/cnceleb/v3_finetune/run.sh b/examples/cnceleb/v3_finetune/run.sh
index 3a15f57a..79e9328f 100755
--- a/examples/cnceleb/v3_finetune/run.sh
+++ b/examples/cnceleb/v3_finetune/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -60,7 +64,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/sre/v2/run.sh b/examples/sre/v2/run.sh
index 4fcffad6..16f41211 100755
--- a/examples/sre/v2/run.sh
+++ b/examples/sre/v2/run.sh
@@ -9,6 +9,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 # the sre data should be prepared in kaldi format and stored in the following directory
 # only wav.scp, utt2spk and spk2utt files are needed
 sre_data_dir=sre_data_dir
@@ -65,7 +69,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/sre/v3/run.sh b/examples/sre/v3/run.sh
index ccb39d21..fe6a82e2 100755
--- a/examples/sre/v3/run.sh
+++ b/examples/sre/v3/run.sh
@@ -22,6 +22,10 @@
 stage=1
 stop_stage=1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -194,9 +198,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   else
     num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
   fi
-  echo "Using $num_gpus_train GPUs: $gpus"
-  #torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus_train \   # The below is to prevent problems if many jobs run on the same machine
-  torchrun --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$((RANDOM)) --nnodes=1 --nproc_per_node=$num_gpus_train \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v1/Whisper-PMFA/run.sh b/examples/voxceleb/v1/Whisper-PMFA/run.sh
index 577423f9..0b12d006 100644
--- a/examples/voxceleb/v1/Whisper-PMFA/run.sh
+++ b/examples/voxceleb/v1/Whisper-PMFA/run.sh
@@ -8,6 +8,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="raw"  # shard/raw
 model=whisper_PMFA_large_v2
@@ -57,7 +61,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training with frozen whisper parameter..."
   config=conf/whisper_PMFA_stage0.yaml
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
@@ -84,7 +90,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   config=conf/whisper_PMFA_stage1.yaml
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
   checkpoint=${exp_dir}/models/model_4.pt
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v2/run.sh b/examples/voxceleb/v2/run.sh
index 955755a8..272996c9 100755
--- a/examples/voxceleb/v2/run.sh
+++ b/examples/voxceleb/v2/run.sh
@@ -5,9 +5,16 @@
 
 . ./path.sh || exit 1
 
+# multi-node + multi-gpus:
+# bash run.sh --stage 3 --stop-stage 3 --HOST_NODE_ADDR "xxx.xxx.xxx.xxx:port" --num_nodes num_node
+
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -55,7 +62,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=29401 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v2/run_wavlm.sh b/examples/voxceleb/v2/run_wavlm.sh
index 4eb35a4e..40eed073 100755
--- a/examples/voxceleb/v2/run_wavlm.sh
+++ b/examples/voxceleb/v2/run_wavlm.sh
@@ -7,6 +7,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -57,7 +61,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=29401 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v2_deprecated/run.sh b/examples/voxceleb/v2_deprecated/run.sh
index f4817d32..26342abf 100755
--- a/examples/voxceleb/v2_deprecated/run.sh
+++ b/examples/voxceleb/v2_deprecated/run.sh
@@ -8,6 +8,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 config=conf/resnet.yaml
 exp_dir=exp/ResNet34-TSTP-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-SGD-epoch150
 gpus="[0,1]"
@@ -28,7 +32,9 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train_deprecated.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v3/dino/run.sh b/examples/voxceleb/v3/dino/run.sh
index 27e1bb39..2dd6b8bd 100755
--- a/examples/voxceleb/v3/dino/run.sh
+++ b/examples/voxceleb/v3/dino/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_dino.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v3/moco/run.sh b/examples/voxceleb/v3/moco/run.sh
index a70e7475..f59b95c3 100755
--- a/examples/voxceleb/v3/moco/run.sh
+++ b/examples/voxceleb/v3/moco/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_contrastive.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v3/simclr/run.sh b/examples/voxceleb/v3/simclr/run.sh
index 9c44a420..3c3b1c8a 100755
--- a/examples/voxceleb/v3/simclr/run.sh
+++ b/examples/voxceleb/v3/simclr/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:29400"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+    --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_contrastive.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/wespeaker/bin/train.py b/wespeaker/bin/train.py
index 63fb99bc..54ca0053 100644
--- a/wespeaker/bin/train.py
+++ b/wespeaker/bin/train.py
@@ -46,9 +46,10 @@ def train(config='conf/config.yaml', **kwargs):
     configs = parse_config_or_kwargs(config, **kwargs)
     checkpoint = configs.get('checkpoint', None)
     # dist configs
-    rank = int(os.environ['RANK'])
+    local_rank = int(os.environ.get('LOCAL_RANK', 0))
+    rank = int(os.environ.get('RANK', 0))
     world_size = int(os.environ['WORLD_SIZE'])
-    gpu = int(configs['gpus'][rank])
+    gpu = int(configs['gpus'][local_rank])
     torch.cuda.set_device(gpu)
     dist.init_process_group(backend='nccl')
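Usage sketch (not taken verbatim from the patch): with the options added above, a two-node run of one of these recipes might be launched as follows. The endpoint "192.168.1.10:29400" is a placeholder for the reachable address and port of the rendezvous node, and the same command is issued on every node:

  # run this on each of the two nodes, all pointing at the same rendezvous endpoint
  bash run.sh --stage 3 --stop-stage 3 \
    --num_nodes 2 \
    --HOST_NODE_ADDR "192.168.1.10:29400"

Each node then starts one torchrun worker per GPU listed in $gpus, and the c10d rendezvous keyed by $job_id joins all workers into a single training job; inside train.py, LOCAL_RANK selects the GPU on each node while RANK stays globally unique.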