From 81dcabea8d9e781f252e719de9e3839e351b46e8 Mon Sep 17 00:00:00 2001
From: Hongji Wang <jijijiang77@gmail.com>
Date: Fri, 23 Aug 2024 18:37:41 +0800
Subject: [PATCH 1/6] [diar] deprecate silero-vad v3.1 in v1

---
 examples/voxconverse/v1/README.md | 12 +++++++-----
 examples/voxconverse/v1/run.sh    |  5 ++---
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/voxconverse/v1/README.md b/examples/voxconverse/v1/README.md
index 13ec2aef..0c51b555 100644
--- a/examples/voxconverse/v1/README.md
+++ b/examples/voxconverse/v1/README.md
@@ -1,11 +1,13 @@
 ## Overview
 
 * We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported.
-* Dataset: voxconverse_dev that consists of 216 utterances
-* Speaker model: ResNet34 model pretrained by wespeaker
+* Dataset: Voxconverse2020 (dev: 216 utts)
+* Speaker model: ResNet34 model pretrained by WeSpeaker
   * Refer to [voxceleb sv recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2)
   * [pretrained model path](https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx)
-* Speaker activity detection model: oracle SAD (from ground truth annotation) or system SAD (VAD model pretrained by silero, https://github.com/snakers4/silero-vad)
+* Speaker activity detection model:
+  * oracle SAD (from ground truth annotation)
+  * system SAD (VAD model pretrained by [silero-vad](https://github.com/snakers4/silero-vad), v3.1 is deprecated now)
 * Clustering method: spectral clustering
 * Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%)
 
@@ -15,8 +17,8 @@
 
     | system | MISS | FA | SC | DER |
     |:---|:---:|:---:|:---:|:---:|
-    | This repo (with oracle SAD) | 2.3 | 0.0 | 1.9 | 4.2 |
-    | This repo (with system SAD) | 3.7 | 0.8 | 2.0 | 6.5 |
+    | Ours (oracle SAD + spectral clustering) | 2.3 | 0.0 | 1.9 | 4.2 |
+    | Ours (silero-vad v3.1 + spectral clustering) | 3.7 | 0.8 | 2.0 | 6.5 |
     | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 |
     | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 |
     | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 |
diff --git a/examples/voxconverse/v1/run.sh b/examples/voxconverse/v1/run.sh
index b852ec80..f60bd3ad 100755
--- a/examples/voxconverse/v1/run.sh
+++ b/examples/voxconverse/v1/run.sh
@@ -29,8 +29,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     unzip -o external_tools/SCTK-v2.4.12.zip -d external_tools
 
     # [2] Download voice activity detection model pretrained by Silero Team
-    wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v3.1.zip -O external_tools/silero-vad-v3.1.zip
-    unzip -o external_tools/silero-vad-v3.1.zip -d external_tools
+    #wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v3.1.zip -O external_tools/silero-vad-v3.1.zip
+    #unzip -o external_tools/silero-vad-v3.1.zip -d external_tools
 
     # [3] Download ResNet34 speaker model pretrained by WeSpeaker Team
     mkdir -p pretrained_models
@@ -79,7 +79,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     if [[ "x${sad_type}" == "xsystem" ]]; then
        # System SAD: applying 'silero' VAD
        python3 wespeaker/diar/make_system_sad.py \
-               --repo-path external_tools/silero-vad-3.1 \
                --scp data/dev/wav.scp \
                --min-duration $min_duration > data/dev/system_sad
     fi

From b0f68903b955633d5e4ffd0cbd29695a0d4f4476 Mon Sep 17 00:00:00 2001
From: Hongji Wang <jijijiang77@gmail.com>
Date: Fri, 23 Aug 2024 18:45:54 +0800
Subject: [PATCH 2/6] [diar] 1. deprecate silero-vad v3.1 in v2; 2. add the
 "cluster_type" parameter and merge v3 into v2;

---
 examples/voxconverse/v2/README.md |  25 ++--
 examples/voxconverse/v2/run.sh    |  34 +++---
 examples/voxconverse/v3/README.md |  34 ------
 examples/voxconverse/v3/local     |   1 -
 examples/voxconverse/v3/path.sh   |   1 -
 examples/voxconverse/v3/run.sh    | 186 ------------------------------
 examples/voxconverse/v3/tools     |   1 -
 examples/voxconverse/v3/wespeaker |   1 -
 8 files changed, 33 insertions(+), 250 deletions(-)
 delete mode 100644 examples/voxconverse/v3/README.md
 delete mode 120000 examples/voxconverse/v3/local
 delete mode 120000 examples/voxconverse/v3/path.sh
 delete mode 100755 examples/voxconverse/v3/run.sh
 delete mode 120000 examples/voxconverse/v3/tools
 delete mode 120000 examples/voxconverse/v3/wespeaker

diff --git a/examples/voxconverse/v2/README.md b/examples/voxconverse/v2/README.md
index 02f41fa1..7a1d339a 100644
--- a/examples/voxconverse/v2/README.md
+++ b/examples/voxconverse/v2/README.md
@@ -1,12 +1,16 @@
 ## Overview
 
 * We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported.
-* Dataset: voxconverse_dev that consists of 216 utterances
-* Speaker model: ResNet34 model pretrained by wespeaker
+* Dataset: Voxconverse2020 (dev: 216 utts, test: 232 utts)
+* Speaker model: ResNet34 model pretrained by WeSpeaker
   * Refer to [voxceleb sv recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2)
   * [pretrained model path](https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx)
-* Speaker activity detection model: oracle SAD (from ground truth annotation) or system SAD (VAD model pretrained by silero, https://github.com/snakers4/silero-vad)
-* Clustering method: spectral clustering
+* Speaker activity detection model:
+  * oracle SAD (from ground truth annotation)
+  * system SAD (VAD model pretrained by [silero-vad](https://github.com/snakers4/silero-vad), v3.1 => v5.1)
+* Clustering method:
+  * spectral clustering
+  * umap dimensionality reduction + hdbscan clustering
 * Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%)
 
 ## Results
@@ -15,8 +19,11 @@
 
     | system | MISS | FA | SC | DER |
     |:---|:---:|:---:|:---:|:---:|
-    | This repo (with oracle SAD) | 2.3 | 0.0 | 2.1 | 4.4 |
-    | This repo (with system SAD) | 3.7 | 0.8 | 2.2 | 6.8 |
+    | Ours (oracle SAD + spectral clustering) | 2.3 | 0.0 | 2.1 | 4.4 |
+    | Ours (oracle SAD + umap clustering) | 2.3 | 0.0 | 1.3 | 3.6 |
+    | Ours (silero-vad v3.1 + spectral clustering) | 3.7 | 0.8 | 2.2 | 6.7 |
+    | Ours (silero-vad v5.1 + spectral clustering) | 3.4 | 0.6 | 2.3 | 6.3 |
+    | Ours (silero-vad v5.1 + umap clustering) | 3.4 | 0.6 | 1.4 | 5.4 |
     | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 |
     | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 |
     | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 |
@@ -27,7 +34,11 @@
 
     | system | MISS | FA | SC | DER |
     |:---|:---:|:---:|:---:|:---:|
-    | This repo (with system SAD) | 4.0 | 2.4 | 3.4 | 9.8 |
+    | Ours (oracle SAD + spectral clustering) | 1.6 | 0.0 | 3.3 | 4.9 |
+    | Ours (oracle SAD + umap clustering) | 1.6 | 0.0 | 1.9 | 3.5 |
+    | Ours (silero-vad v3.1 + spectral clustering) | 4.0 | 2.4 | 3.4 | 9.8 |
+    | Ours (silero-vad v5.1 + spectral clustering) | 3.8 | 1.7 | 3.3 | 8.8 |
+    | Ours (silero-vad v5.1 + umap clustering) | 3.8 | 1.7 | 1.8 | 7.3 |
 
 
 [^1]: Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf
diff --git a/examples/voxconverse/v2/run.sh b/examples/voxconverse/v2/run.sh
index 8e786297..d5bcd0c4 100755
--- a/examples/voxconverse/v2/run.sh
+++ b/examples/voxconverse/v2/run.sh
@@ -18,8 +18,9 @@
 
 stage=-1
 stop_stage=-1
-sad_type="oracle"
-partition="dev"
+sad_type="oracle"       # oracle/system
+partition="dev"         # dev/test
+cluster_type="spectral" # spectral/umap
 
 # do cmn on the sub-segment or on the vad segment
 subseg_cmn=true
@@ -36,11 +37,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     wget -c https://github.com/usnistgov/SCTK/archive/refs/tags/v2.4.12.zip -O external_tools/SCTK-v2.4.12.zip
     unzip -o external_tools/SCTK-v2.4.12.zip -d external_tools
 
-    # [2] Download voice activity detection model pretrained by Silero Team
-    wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v3.1.zip -O external_tools/silero-vad-v3.1.zip
-    unzip -o external_tools/silero-vad-v3.1.zip -d external_tools
-
-    # [3] Download ResNet34 speaker model pretrained by WeSpeaker Team
+    # [2] Download ResNet34 speaker model pretrained by WeSpeaker Team
     mkdir -p pretrained_models
 
     wget -c https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx -O pretrained_models/voxceleb_resnet34_LM.onnx
@@ -101,7 +98,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     if [[ "x${sad_type}" == "xsystem" ]]; then
        # System SAD: applying 'silero' VAD
        python3 wespeaker/diar/make_system_sad.py \
-               --repo-path external_tools/silero-vad-3.1 \
                --scp data/${partition}/wav.scp \
                --min-duration $min_duration > data/${partition}/system_sad
     fi
@@ -144,24 +140,24 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
 fi
 
 
-# Applying spectral clustering algorithm
+# Applying spectral or umap+hdbscan clustering algorithm
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
 
-    [ -f "exp/spectral_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/spectral_cluster/${partition}_${sad_type}_sad_labels
+    [ -f "exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels
 
-    echo "Doing spectral clustering and store the result in exp/spectral_cluster/${partition}_${sad_type}_sad_labels"
+    echo "Doing ${cluster_type} clustering and store the result in exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels"
     echo "..."
-    python3 wespeaker/diar/spectral_clusterer.py \
+    python3 wespeaker/diar/${cluster_type}_clusterer.py \
             --scp exp/${partition}_${sad_type}_sad_embedding/emb.scp \
-            --output exp/spectral_cluster/${partition}_${sad_type}_sad_labels
+            --output exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels
 fi
 
 
 # Convert labels to RTTMs
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
     python3 wespeaker/diar/make_rttm.py \
-            --labels exp/spectral_cluster/${partition}_${sad_type}_sad_labels \
-            --channel 1 > exp/spectral_cluster/${partition}_${sad_type}_sad_rttm
+            --labels exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels \
+            --channel 1 > exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm
 fi
 
 
@@ -173,18 +169,18 @@ if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
     perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
          -c 0.25 \
          -r <(cat ${ref_dir}/${partition}/*.rttm) \
-         -s exp/spectral_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/spectral_cluster/${partition}_${sad_type}_sad_res
+         -s exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_res
 
     if [ ${get_each_file_res} -eq 1 ];then
-        single_file_res_dir=exp/spectral_cluster/${partition}_${sad_type}_single_file_res
+        single_file_res_dir=exp/${cluster_type}_cluster/${partition}_${sad_type}_single_file_res
         mkdir -p $single_file_res_dir
         echo -e "\nGet the DER results for each file and the results will be stored underd ${single_file_res_dir}\n..."
 
-        awk '{print $2}' exp/spectral_cluster/${partition}_${sad_type}_sad_rttm | sort -u  | while read file_name; do
+        awk '{print $2}' exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm | sort -u  | while read file_name; do
             perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
                  -c 0.25 \
                  -r <(cat ${ref_dir}/${partition}/${file_name}.rttm) \
-                 -s <(grep "${file_name}" exp/spectral_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res
+                 -s <(grep "${file_name}" exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res
         done
         echo "Done!"
     fi
diff --git a/examples/voxconverse/v3/README.md b/examples/voxconverse/v3/README.md
deleted file mode 100644
index 5b333714..00000000
--- a/examples/voxconverse/v3/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-## Overview
-
-* We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported.
-* Dataset: voxconverse_dev that consists of 216 utterances
-* Speaker model: ResNet34 model pretrained by wespeaker
-  * Refer to [voxceleb sv recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2)
-  * [pretrained model path](https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx)
-* Speaker activity detection model: oracle SAD (from ground truth annotation) or system SAD (VAD model pretrained by silero, https://github.com/snakers4/silero-vad)
-* Clustering method: umap dimensionality reduction + hdbscan clustering
-* Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%)
-
-## Results
-
-* Dev set
-
-    | system | MISS | FA | SC | DER |
-    |:---|:---:|:---:|:---:|:---:|
-    | This repo (with oracle SAD) | 2.3 | 0.0 | 1.3 | 3.6 |
-    | This repo (with system SAD) | 3.4 | 0.6 | 1.4 | 5.4 |
-    | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 |
-    | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 |
-    | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 |
-    | (AVSE ASD only) [^1] | 2.0 | 5.9 | 4.6 | 12.4 |
-    | (proposed) [^1] | 2.4 | 2.3 | 3.0 | 7.7 |
-
-* Test set
-
-    | system | MISS | FA | SC | DER |
-    |:---|:---:|:---:|:---:|:---:|
-    | This repo (with oracle SAD) | 1.6 | 0.0 | 1.9 | 3.5 |
-    | This repo (with system SAD) | 3.8 | 1.7 | 1.8 | 7.4 |
-
-
-[^1]: Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf
diff --git a/examples/voxconverse/v3/local b/examples/voxconverse/v3/local
deleted file mode 120000
index 8b1d5f97..00000000
--- a/examples/voxconverse/v3/local
+++ /dev/null
@@ -1 +0,0 @@
-../v2/local
\ No newline at end of file
diff --git a/examples/voxconverse/v3/path.sh b/examples/voxconverse/v3/path.sh
deleted file mode 120000
index b6a713c8..00000000
--- a/examples/voxconverse/v3/path.sh
+++ /dev/null
@@ -1 +0,0 @@
-../v2/path.sh
\ No newline at end of file
diff --git a/examples/voxconverse/v3/run.sh b/examples/voxconverse/v3/run.sh
deleted file mode 100755
index f53cfab5..00000000
--- a/examples/voxconverse/v3/run.sh
+++ /dev/null
@@ -1,186 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2022-2023 Xu Xiang
-#               2022 Zhengyang Chen (chenzhengyang117@gmail.com)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-. ./path.sh || exit 1
-
-stage=-1
-stop_stage=-1
-sad_type="oracle"
-partition="dev"
-
-# do cmn on the sub-segment or on the vad segment
-subseg_cmn=true
-# whether print the evaluation result for each file
-get_each_file_res=1
-
-. tools/parse_options.sh
-
-# Prerequisite
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    mkdir -p external_tools
-
-    # [1] Download evaluation toolkit
-    wget -c https://github.com/usnistgov/SCTK/archive/refs/tags/v2.4.12.zip -O external_tools/SCTK-v2.4.12.zip
-    unzip -o external_tools/SCTK-v2.4.12.zip -d external_tools
-
-    # [3] Download ResNet34 speaker model pretrained by WeSpeaker Team
-    mkdir -p pretrained_models
-
-    wget -c https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx -O pretrained_models/voxceleb_resnet34_LM.onnx
-fi
-
-
-# Download VoxConverse dev/test audios and the corresponding annotations
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    mkdir -p data
-
-    # Download annotations for dev and test sets (version 0.0.3)
-    wget -c https://github.com/joonson/voxconverse/archive/refs/heads/master.zip -O data/voxconverse_master.zip
-    unzip -o data/voxconverse_master.zip -d data
-
-    # Download annotations from VoxSRC-23 validation toolkit (looks like version 0.0.2)
-    # cd data && git clone https://github.com/JaesungHuh/VoxSRC2023.git --recursive && cd -
-
-    # Download dev audios
-    mkdir -p data/dev
-
-    #wget --no-check-certificate -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_dev_wav.zip -O data/voxconverse_dev_wav.zip
-    # The above url may not be reachable, you can try the link below.
-    # This url is from https://github.com/joonson/voxconverse/blob/master/README.md
-    wget --no-check-certificate -c https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_dev_wav.zip -O data/voxconverse_dev_wav.zip
-    unzip -o data/voxconverse_dev_wav.zip -d data/dev
-
-    # Create wav.scp for dev audios
-    ls `pwd`/data/dev/audio/*.wav | awk -F/ '{print substr($NF, 1, length($NF)-4), $0}' > data/dev/wav.scp
-
-    # Test audios
-    mkdir -p data/test
-
-    #wget --no-check-certificate -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_test_wav.zip -O data/voxconverse_test_wav.zip
-    # The above url may not be reachable, you can try the link below.
-    # This url is from https://github.com/joonson/voxconverse/blob/master/README.md
-    wget  --no-check-certificate -c https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_test_wav.zip -O data/voxconverse_test_wav.zip
-    unzip -o data/voxconverse_test_wav.zip -d data/test
-
-    # Create wav.scp for test audios
-    ls `pwd`/data/test/voxconverse_test_wav/*.wav | awk -F/ '{print substr($NF, 1, length($NF)-4), $0}' > data/test/wav.scp
-fi
-
-
-# Voice activity detection
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # Set VAD min duration
-    min_duration=0.255
-
-    if [[ "x${sad_type}" == "xoracle" ]]; then
-        # Oracle SAD: handling overlapping or too short regions in ground truth RTTM
-        while read -r utt wav_path; do
-            python3 wespeaker/diar/make_oracle_sad.py \
-                    --rttm data/voxconverse-master/${partition}/${utt}.rttm \
-                    --min-duration $min_duration
-        done < data/${partition}/wav.scp > data/${partition}/oracle_sad
-    fi
-
-    if [[ "x${sad_type}" == "xsystem" ]]; then
-       # System SAD: applying 'silero' VAD
-       python3 wespeaker/diar/make_system_sad.py \
-               --scp data/${partition}/wav.scp \
-               --min-duration $min_duration > data/${partition}/system_sad
-    fi
-fi
-
-
-# Extract fbank features
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-
-    [ -d "exp/${sad_type}_sad_fbank" ] && rm -r exp/${sad_type}_sad_fbank
-
-    echo "Make Fbank features and store it under exp/${sad_type}_sad_fbank"
-    echo "..."
-    bash local/make_fbank.sh \
-            --scp data/${partition}/wav.scp \
-            --segments data/${partition}/${sad_type}_sad \
-            --store_dir exp/${partition}_${sad_type}_sad_fbank \
-            --subseg_cmn ${subseg_cmn} \
-            --nj 24
-fi
-
-# Extract embeddings
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-
-    [ -d "exp/${sad_type}_sad_embedding" ] && rm -r exp/${sad_type}_sad_embedding
-
-    echo "Extract embeddings and store it under exp/${sad_type}_sad_embedding"
-    echo "..."
-    bash local/extract_emb.sh \
-            --scp exp/${partition}_${sad_type}_sad_fbank/fbank.scp \
-            --pretrained_model pretrained_models/voxceleb_resnet34_LM.onnx \
-            --device cuda \
-            --store_dir exp/${partition}_${sad_type}_sad_embedding \
-            --batch_size 96 \
-            --frame_shift 10 \
-            --window_secs 1.5 \
-            --period_secs 0.75 \
-            --subseg_cmn ${subseg_cmn} \
-            --nj 1
-fi
-
-
-# Applying umap clustering algorithm
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-
-    [ -f "exp/umap_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/umap_cluster/${partition}_${sad_type}_sad_labels
-
-    echo "Doing umap clustering and store the result in exp/umap_cluster/${partition}_${sad_type}_sad_labels"
-    echo "..."
-    python3 wespeaker/diar/umap_clusterer.py \
-            --scp exp/${partition}_${sad_type}_sad_embedding/emb.scp \
-            --output exp/umap_cluster/${partition}_${sad_type}_sad_labels
-fi
-
-
-# Convert labels to RTTMs
-if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    python3 wespeaker/diar/make_rttm.py \
-            --labels exp/umap_cluster/${partition}_${sad_type}_sad_labels \
-            --channel 1 > exp/umap_cluster/${partition}_${sad_type}_sad_rttm
-fi
-
-
-# Evaluate the result
-if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
-    ref_dir=data/voxconverse-master/
-    #ref_dir=data/VoxSRC2023/voxconverse/
-    echo -e "Get the DER results\n..."
-    perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
-         -c 0.25 \
-         -r <(cat ${ref_dir}/${partition}/*.rttm) \
-         -s exp/umap_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/umap_cluster/${partition}_${sad_type}_sad_res
-
-    if [ ${get_each_file_res} -eq 1 ];then
-        single_file_res_dir=exp/umap_cluster/${partition}_${sad_type}_single_file_res
-        mkdir -p $single_file_res_dir
-        echo -e "\nGet the DER results for each file and the results will be stored underd ${single_file_res_dir}\n..."
-
-        awk '{print $2}' exp/umap_cluster/${partition}_${sad_type}_sad_rttm | sort -u  | while read file_name; do
-            perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
-                 -c 0.25 \
-                 -r <(cat ${ref_dir}/${partition}/${file_name}.rttm) \
-                 -s <(grep "${file_name}" exp/umap_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res
-        done
-        echo "Done!"
-    fi
-fi
diff --git a/examples/voxconverse/v3/tools b/examples/voxconverse/v3/tools
deleted file mode 120000
index c92f4172..00000000
--- a/examples/voxconverse/v3/tools
+++ /dev/null
@@ -1 +0,0 @@
-../../../tools
\ No newline at end of file
diff --git a/examples/voxconverse/v3/wespeaker b/examples/voxconverse/v3/wespeaker
deleted file mode 120000
index 900c560b..00000000
--- a/examples/voxconverse/v3/wespeaker
+++ /dev/null
@@ -1 +0,0 @@
-../../../wespeaker
\ No newline at end of file

From 9da681e9ebbcee7257470c660f5c756f522a9b42 Mon Sep 17 00:00:00 2001
From: Hongji Wang <jijijiang77@gmail.com>
Date: Fri, 23 Aug 2024 18:54:29 +0800
Subject: [PATCH 3/6] [diar] update README.md for voxconverse recipes

---
 examples/voxconverse/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/voxconverse/README.md b/examples/voxconverse/README.md
index 3fb5772b..ab20d24c 100644
--- a/examples/voxconverse/README.md
+++ b/examples/voxconverse/README.md
@@ -1,3 +1,7 @@
 This is a **WeSpeaker** speaker diarization recipe on the Voxconverse 2020 dataset. It focused on a ``in the wild`` scenario, which was collected from YouTube videos with a semi-automatic pipeline and released for the diarization track in VoxSRC 2020 Challenge. See https://www.robots.ox.ac.uk/~vgg/data/voxconverse/ for more detailed information.
 
 Two recipes are provided, including **v1** and **v2**. Their only difference is that in **v2**, we split the Fbank extraction, embedding extraction and clustering modules to different stages. We recommend newcomers to follow the **v2** recipe and run it stage by stage.
+
+🔥 UPDATE 2024.08.20:
+* silero-vad v5.1 is used in place of v3.1
+* umap dimensionality reduction + hdbscan clustering is also supported
\ No newline at end of file

From 67bc9128008e9be23de30ded2bfc40ac2cf0805a Mon Sep 17 00:00:00 2001
From: Hongji Wang <jijijiang77@gmail.com>
Date: Fri, 23 Aug 2024 19:00:59 +0800
Subject: [PATCH 4/6] [diar] update README.md

---
 examples/voxconverse/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/voxconverse/README.md b/examples/voxconverse/README.md
index ab20d24c..85af1c2d 100644
--- a/examples/voxconverse/README.md
+++ b/examples/voxconverse/README.md
@@ -4,4 +4,4 @@ Two recipes are provided, including **v1** and **v2**. Their only difference is
 
 🔥 UPDATE 2024.08.20:
 * silero-vad v5.1 is used in place of v3.1
-* umap dimensionality reduction + hdbscan clustering is also supported
\ No newline at end of file
+* umap dimensionality reduction + hdbscan clustering is also supported in v2

From 418e98128b1684274e1e1e7edd17dd24b2ae18e5 Mon Sep 17 00:00:00 2001
From: Hongji Wang <jijijiang77@gmail.com>
Date: Fri, 23 Aug 2024 19:06:09 +0800
Subject: [PATCH 5/6] [diar] update voxconverse/v2/run.sh

---
 examples/voxconverse/v2/run.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/voxconverse/v2/run.sh b/examples/voxconverse/v2/run.sh
index d5bcd0c4..6c83171c 100755
--- a/examples/voxconverse/v2/run.sh
+++ b/examples/voxconverse/v2/run.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # Copyright (c) 2022-2023 Xu Xiang
 #               2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+#               2024 Hongji Wang (jijijiang77@gmail.com)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 555bde300b609aa5a3eda228d406e258cb861bb7 Mon Sep 17 00:00:00 2001
From: Hongji Wang <jijijiang77@gmail.com>
Date: Fri, 23 Aug 2024 19:09:49 +0800
Subject: [PATCH 6/6] [docs] update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index afb05573..690e93e8 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ pre-commit install  # for clean and tidy code
 ```
 
 ## 🔥 News
-* 2024.08.20: Update diarization recipe for VoxConverse dataset by leveraging umap dimensionality reduction and hdbscan clustering, see [#347](https://github.com/wenet-e2e/wespeaker/pull/347).
+* 2024.08.20: Update diarization recipe for VoxConverse dataset by leveraging umap dimensionality reduction and hdbscan clustering, see [#347](https://github.com/wenet-e2e/wespeaker/pull/347) and [#352](https://github.com/wenet-e2e/wespeaker/pull/352).
 * 2024.08.18: Support using ssl pre-trained models as the frontend. The [WavLM recipe](https://github.com/wenet-e2e/wespeaker/blob/master/examples/voxceleb/v2/run_wavlm.sh) is also provided, see [#344](https://github.com/wenet-e2e/wespeaker/pull/344).
 * 2024.05.15: Add support for [quality-aware score calibration](https://arxiv.org/pdf/2211.00815), see [#320](https://github.com/wenet-e2e/wespeaker/pull/320).
 * 2024.04.25: Add support for the gemini-dfresnet model, see [#291](https://github.com/wenet-e2e/wespeaker/pull/291).