From 81dcabea8d9e781f252e719de9e3839e351b46e8 Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Fri, 23 Aug 2024 18:37:41 +0800 Subject: [PATCH 1/6] [diar] deprecate silero-vad v3.1 in v1 --- examples/voxconverse/v1/README.md | 12 +++++++----- examples/voxconverse/v1/run.sh | 5 ++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/voxconverse/v1/README.md b/examples/voxconverse/v1/README.md index 13ec2aef..0c51b555 100644 --- a/examples/voxconverse/v1/README.md +++ b/examples/voxconverse/v1/README.md @@ -1,11 +1,13 @@ ## Overview * We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported. -* Dataset: voxconverse_dev that consists of 216 utterances -* Speaker model: ResNet34 model pretrained by wespeaker +* Dataset: Voxconverse2020 (dev: 216 utts) +* Speaker model: ResNet34 model pretrained by WeSpeaker * Refer to [voxceleb sv recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2) * [pretrained model path](https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx) -* Speaker activity detection model: oracle SAD (from ground truth annotation) or system SAD (VAD model pretrained by silero, https://github.com/snakers4/silero-vad) +* Speaker activity detection model: + * oracle SAD (from ground truth annotation) + * system SAD (VAD model pretrained by [silero-vad](https://github.com/snakers4/silero-vad), v3.1 is deprecated now) * Clustering method: spectral clustering * Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%) @@ -15,8 +17,8 @@ | system | MISS | FA | SC | DER | |:---|:---:|:---:|:---:|:---:| - | This repo (with oracle SAD) | 2.3 | 0.0 | 1.9 | 4.2 | - | This repo (with system SAD) | 3.7 | 0.8 | 2.0 | 6.5 | + | Ours (oracle SAD + spectral clustering) | 2.3 | 0.0 | 1.9 | 4.2 | + | Ours (silero-vad v3.1 + spectral clustering) | 3.7 | 0.8 | 2.0 | 6.5 | | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 | | DIHARD 2019 baseline w/ SE 
[^1] | 9.3 | 1.3 | 9.7 | 20.2 | | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 | diff --git a/examples/voxconverse/v1/run.sh b/examples/voxconverse/v1/run.sh index b852ec80..f60bd3ad 100755 --- a/examples/voxconverse/v1/run.sh +++ b/examples/voxconverse/v1/run.sh @@ -29,8 +29,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then unzip -o external_tools/SCTK-v2.4.12.zip -d external_tools # [2] Download voice activity detection model pretrained by Silero Team - wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v3.1.zip -O external_tools/silero-vad-v3.1.zip - unzip -o external_tools/silero-vad-v3.1.zip -d external_tools + #wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v3.1.zip -O external_tools/silero-vad-v3.1.zip + #unzip -o external_tools/silero-vad-v3.1.zip -d external_tools # [3] Download ResNet34 speaker model pretrained by WeSpeaker Team mkdir -p pretrained_models @@ -79,7 +79,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [[ "x${sad_type}" == "xsystem" ]]; then # System SAD: applying 'silero' VAD python3 wespeaker/diar/make_system_sad.py \ - --repo-path external_tools/silero-vad-3.1 \ --scp data/dev/wav.scp \ --min-duration $min_duration > data/dev/system_sad fi From b0f68903b955633d5e4ffd0cbd29695a0d4f4476 Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Fri, 23 Aug 2024 18:45:54 +0800 Subject: [PATCH 2/6] [diar] 1. deprecate silero-vad v3.1 in v2; 2. 
add the "cluster_type" parameter and merge v3 into v2; --- examples/voxconverse/v2/README.md | 25 ++-- examples/voxconverse/v2/run.sh | 34 +++--- examples/voxconverse/v3/README.md | 34 ------ examples/voxconverse/v3/local | 1 - examples/voxconverse/v3/path.sh | 1 - examples/voxconverse/v3/run.sh | 186 ------------------------------ examples/voxconverse/v3/tools | 1 - examples/voxconverse/v3/wespeaker | 1 - 8 files changed, 33 insertions(+), 250 deletions(-) delete mode 100644 examples/voxconverse/v3/README.md delete mode 120000 examples/voxconverse/v3/local delete mode 120000 examples/voxconverse/v3/path.sh delete mode 100755 examples/voxconverse/v3/run.sh delete mode 120000 examples/voxconverse/v3/tools delete mode 120000 examples/voxconverse/v3/wespeaker diff --git a/examples/voxconverse/v2/README.md b/examples/voxconverse/v2/README.md index 02f41fa1..7a1d339a 100644 --- a/examples/voxconverse/v2/README.md +++ b/examples/voxconverse/v2/README.md @@ -1,12 +1,16 @@ ## Overview * We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported. 
-* Dataset: voxconverse_dev that consists of 216 utterances -* Speaker model: ResNet34 model pretrained by wespeaker +* Dataset: Voxconverse2020 (dev: 216 utts, test: 232 utts) +* Speaker model: ResNet34 model pretrained by WeSpeaker * Refer to [voxceleb sv recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2) * [pretrained model path](https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx) -* Speaker activity detection model: oracle SAD (from ground truth annotation) or system SAD (VAD model pretrained by silero, https://github.com/snakers4/silero-vad) -* Clustering method: spectral clustering +* Speaker activity detection model: + * oracle SAD (from ground truth annotation) + * system SAD (VAD model pretrained by [silero-vad](https://github.com/snakers4/silero-vad), v3.1 => v5.1) +* Clustering method: + * spectral clustering + * umap dimensionality reduction + hdbscan clustering * Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%) ## Results @@ -15,8 +19,11 @@ | system | MISS | FA | SC | DER | |:---|:---:|:---:|:---:|:---:| - | This repo (with oracle SAD) | 2.3 | 0.0 | 2.1 | 4.4 | - | This repo (with system SAD) | 3.7 | 0.8 | 2.2 | 6.8 | + | Ours (oracle SAD + spectral clustering) | 2.3 | 0.0 | 2.1 | 4.4 | + | Ours (oracle SAD + umap clustering) | 2.3 | 0.0 | 1.3 | 3.6 | + | Ours (silero-vad v3.1 + spectral clustering) | 3.7 | 0.8 | 2.2 | 6.7 | + | Ours (silero-vad v5.1 + spectral clustering) | 3.4 | 0.6 | 2.3 | 6.3 | + | Ours (silero-vad v5.1 + umap clustering) | 3.4 | 0.6 | 1.4 | 5.4 | | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 | | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 | | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 | @@ -27,7 +34,11 @@ | system | MISS | FA | SC | DER | |:---|:---:|:---:|:---:|:---:| - | This repo (with system SAD) | 4.0 | 2.4 | 3.4 | 9.8 | + | Ours (oracle SAD + spectral clustering) | 1.6 | 0.0 | 3.3 | 4.9 | + | Ours (oracle SAD + 
umap clustering) | 1.6 | 0.0 | 1.9 | 3.5 | + | Ours (silero-vad v3.1 + spectral clustering) | 4.0 | 2.4 | 3.4 | 9.8 | + | Ours (silero-vad v5.1 + spectral clustering) | 3.8 | 1.7 | 3.3 | 8.8 | + | Ours (silero-vad v5.1 + umap clustering) | 3.8 | 1.7 | 1.8 | 7.3 | [^1]: Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf diff --git a/examples/voxconverse/v2/run.sh b/examples/voxconverse/v2/run.sh index 8e786297..d5bcd0c4 100755 --- a/examples/voxconverse/v2/run.sh +++ b/examples/voxconverse/v2/run.sh @@ -18,8 +18,9 @@ stage=-1 stop_stage=-1 -sad_type="oracle" -partition="dev" +sad_type="oracle" # oracle/system +partition="dev" # dev/test +cluster_type="spectral" # spectral/umap # do cmn on the sub-segment or on the vad segment subseg_cmn=true @@ -36,11 +37,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then wget -c https://github.com/usnistgov/SCTK/archive/refs/tags/v2.4.12.zip -O external_tools/SCTK-v2.4.12.zip unzip -o external_tools/SCTK-v2.4.12.zip -d external_tools - # [2] Download voice activity detection model pretrained by Silero Team - wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v3.1.zip -O external_tools/silero-vad-v3.1.zip - unzip -o external_tools/silero-vad-v3.1.zip -d external_tools - - # [3] Download ResNet34 speaker model pretrained by WeSpeaker Team + # [2] Download ResNet34 speaker model pretrained by WeSpeaker Team mkdir -p pretrained_models wget -c https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx -O pretrained_models/voxceleb_resnet34_LM.onnx @@ -101,7 +98,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [[ "x${sad_type}" == "xsystem" ]]; then # System SAD: applying 'silero' VAD python3 wespeaker/diar/make_system_sad.py \ - --repo-path external_tools/silero-vad-3.1 \ --scp data/${partition}/wav.scp \ --min-duration $min_duration > data/${partition}/system_sad fi @@ -144,24 +140,24 @@ if [ ${stage} -le 5 ] && [ 
${stop_stage} -ge 5 ]; then fi -# Applying spectral clustering algorithm +# Applying spectral or umap+hdbscan clustering algorithm if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - [ -f "exp/spectral_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/spectral_cluster/${partition}_${sad_type}_sad_labels + [ -f "exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels - echo "Doing spectral clustering and store the result in exp/spectral_cluster/${partition}_${sad_type}_sad_labels" + echo "Doing ${cluster_type} clustering and store the result in exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels" echo "..." - python3 wespeaker/diar/spectral_clusterer.py \ + python3 wespeaker/diar/${cluster_type}_clusterer.py \ --scp exp/${partition}_${sad_type}_sad_embedding/emb.scp \ - --output exp/spectral_cluster/${partition}_${sad_type}_sad_labels + --output exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels fi # Convert labels to RTTMs if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then python3 wespeaker/diar/make_rttm.py \ - --labels exp/spectral_cluster/${partition}_${sad_type}_sad_labels \ - --channel 1 > exp/spectral_cluster/${partition}_${sad_type}_sad_rttm + --labels exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels \ + --channel 1 > exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm fi @@ -173,18 +169,18 @@ if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \ -c 0.25 \ -r <(cat ${ref_dir}/${partition}/*.rttm) \ - -s exp/spectral_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/spectral_cluster/${partition}_${sad_type}_sad_res + -s exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_res if [ ${get_each_file_res} -eq 1 ];then - 
single_file_res_dir=exp/spectral_cluster/${partition}_${sad_type}_single_file_res + single_file_res_dir=exp/${cluster_type}_cluster/${partition}_${sad_type}_single_file_res mkdir -p $single_file_res_dir echo -e "\nGet the DER results for each file and the results will be stored underd ${single_file_res_dir}\n..." - awk '{print $2}' exp/spectral_cluster/${partition}_${sad_type}_sad_rttm | sort -u | while read file_name; do + awk '{print $2}' exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm | sort -u | while read file_name; do perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \ -c 0.25 \ -r <(cat ${ref_dir}/${partition}/${file_name}.rttm) \ - -s <(grep "${file_name}" exp/spectral_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res + -s <(grep "${file_name}" exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res done echo "Done!" fi diff --git a/examples/voxconverse/v3/README.md b/examples/voxconverse/v3/README.md deleted file mode 100644 index 5b333714..00000000 --- a/examples/voxconverse/v3/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## Overview - -* We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported. 
-* Dataset: voxconverse_dev that consists of 216 utterances -* Speaker model: ResNet34 model pretrained by wespeaker - * Refer to [voxceleb sv recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2) - * [pretrained model path](https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx) -* Speaker activity detection model: oracle SAD (from ground truth annotation) or system SAD (VAD model pretrained by silero, https://github.com/snakers4/silero-vad) -* Clustering method: umap dimensionality reduction + hdbscan clustering -* Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%) - -## Results - -* Dev set - - | system | MISS | FA | SC | DER | - |:---|:---:|:---:|:---:|:---:| - | This repo (with oracle SAD) | 2.3 | 0.0 | 1.3 | 3.6 | - | This repo (with system SAD) | 3.4 | 0.6 | 1.4 | 5.4 | - | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 | - | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 | - | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 | - | (AVSE ASD only) [^1] | 2.0 | 5.9 | 4.6 | 12.4 | - | (proposed) [^1] | 2.4 | 2.3 | 3.0 | 7.7 | - -* Test set - - | system | MISS | FA | SC | DER | - |:---|:---:|:---:|:---:|:---:| - | This repo (with oracle SAD) | 1.6 | 0.0 | 1.9 | 3.5 | - | This repo (with system SAD) | 3.8 | 1.7 | 1.8 | 7.4 | - - -[^1]: Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf diff --git a/examples/voxconverse/v3/local b/examples/voxconverse/v3/local deleted file mode 120000 index 8b1d5f97..00000000 --- a/examples/voxconverse/v3/local +++ /dev/null @@ -1 +0,0 @@ -../v2/local \ No newline at end of file diff --git a/examples/voxconverse/v3/path.sh b/examples/voxconverse/v3/path.sh deleted file mode 120000 index b6a713c8..00000000 --- a/examples/voxconverse/v3/path.sh +++ /dev/null @@ -1 +0,0 @@ -../v2/path.sh \ No newline at end of file diff --git a/examples/voxconverse/v3/run.sh b/examples/voxconverse/v3/run.sh deleted 
file mode 100755 index f53cfab5..00000000 --- a/examples/voxconverse/v3/run.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash -# Copyright (c) 2022-2023 Xu Xiang -# 2022 Zhengyang Chen (chenzhengyang117@gmail.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -. ./path.sh || exit 1 - -stage=-1 -stop_stage=-1 -sad_type="oracle" -partition="dev" - -# do cmn on the sub-segment or on the vad segment -subseg_cmn=true -# whether print the evaluation result for each file -get_each_file_res=1 - -. 
tools/parse_options.sh - -# Prerequisite -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - mkdir -p external_tools - - # [1] Download evaluation toolkit - wget -c https://github.com/usnistgov/SCTK/archive/refs/tags/v2.4.12.zip -O external_tools/SCTK-v2.4.12.zip - unzip -o external_tools/SCTK-v2.4.12.zip -d external_tools - - # [3] Download ResNet34 speaker model pretrained by WeSpeaker Team - mkdir -p pretrained_models - - wget -c https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx -O pretrained_models/voxceleb_resnet34_LM.onnx -fi - - -# Download VoxConverse dev/test audios and the corresponding annotations -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - mkdir -p data - - # Download annotations for dev and test sets (version 0.0.3) - wget -c https://github.com/joonson/voxconverse/archive/refs/heads/master.zip -O data/voxconverse_master.zip - unzip -o data/voxconverse_master.zip -d data - - # Download annotations from VoxSRC-23 validation toolkit (looks like version 0.0.2) - # cd data && git clone https://github.com/JaesungHuh/VoxSRC2023.git --recursive && cd - - - # Download dev audios - mkdir -p data/dev - - #wget --no-check-certificate -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_dev_wav.zip -O data/voxconverse_dev_wav.zip - # The above url may not be reachable, you can try the link below. 
- # This url is from https://github.com/joonson/voxconverse/blob/master/README.md - wget --no-check-certificate -c https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_dev_wav.zip -O data/voxconverse_dev_wav.zip - unzip -o data/voxconverse_dev_wav.zip -d data/dev - - # Create wav.scp for dev audios - ls `pwd`/data/dev/audio/*.wav | awk -F/ '{print substr($NF, 1, length($NF)-4), $0}' > data/dev/wav.scp - - # Test audios - mkdir -p data/test - - #wget --no-check-certificate -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_test_wav.zip -O data/voxconverse_test_wav.zip - # The above url may not be reachable, you can try the link below. - # This url is from https://github.com/joonson/voxconverse/blob/master/README.md - wget --no-check-certificate -c https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_test_wav.zip -O data/voxconverse_test_wav.zip - unzip -o data/voxconverse_test_wav.zip -d data/test - - # Create wav.scp for test audios - ls `pwd`/data/test/voxconverse_test_wav/*.wav | awk -F/ '{print substr($NF, 1, length($NF)-4), $0}' > data/test/wav.scp -fi - - -# Voice activity detection -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Set VAD min duration - min_duration=0.255 - - if [[ "x${sad_type}" == "xoracle" ]]; then - # Oracle SAD: handling overlapping or too short regions in ground truth RTTM - while read -r utt wav_path; do - python3 wespeaker/diar/make_oracle_sad.py \ - --rttm data/voxconverse-master/${partition}/${utt}.rttm \ - --min-duration $min_duration - done < data/${partition}/wav.scp > data/${partition}/oracle_sad - fi - - if [[ "x${sad_type}" == "xsystem" ]]; then - # System SAD: applying 'silero' VAD - python3 wespeaker/diar/make_system_sad.py \ - --scp data/${partition}/wav.scp \ - --min-duration $min_duration > data/${partition}/system_sad - fi -fi - - -# Extract fbank features -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - - [ -d "exp/${sad_type}_sad_fbank" ] && rm -r 
exp/${sad_type}_sad_fbank - - echo "Make Fbank features and store it under exp/${sad_type}_sad_fbank" - echo "..." - bash local/make_fbank.sh \ - --scp data/${partition}/wav.scp \ - --segments data/${partition}/${sad_type}_sad \ - --store_dir exp/${partition}_${sad_type}_sad_fbank \ - --subseg_cmn ${subseg_cmn} \ - --nj 24 -fi - -# Extract embeddings -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - - [ -d "exp/${sad_type}_sad_embedding" ] && rm -r exp/${sad_type}_sad_embedding - - echo "Extract embeddings and store it under exp/${sad_type}_sad_embedding" - echo "..." - bash local/extract_emb.sh \ - --scp exp/${partition}_${sad_type}_sad_fbank/fbank.scp \ - --pretrained_model pretrained_models/voxceleb_resnet34_LM.onnx \ - --device cuda \ - --store_dir exp/${partition}_${sad_type}_sad_embedding \ - --batch_size 96 \ - --frame_shift 10 \ - --window_secs 1.5 \ - --period_secs 0.75 \ - --subseg_cmn ${subseg_cmn} \ - --nj 1 -fi - - -# Applying umap clustering algorithm -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - - [ -f "exp/umap_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/umap_cluster/${partition}_${sad_type}_sad_labels - - echo "Doing umap clustering and store the result in exp/umap_cluster/${partition}_${sad_type}_sad_labels" - echo "..." - python3 wespeaker/diar/umap_clusterer.py \ - --scp exp/${partition}_${sad_type}_sad_embedding/emb.scp \ - --output exp/umap_cluster/${partition}_${sad_type}_sad_labels -fi - - -# Convert labels to RTTMs -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - python3 wespeaker/diar/make_rttm.py \ - --labels exp/umap_cluster/${partition}_${sad_type}_sad_labels \ - --channel 1 > exp/umap_cluster/${partition}_${sad_type}_sad_rttm -fi - - -# Evaluate the result -if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then - ref_dir=data/voxconverse-master/ - #ref_dir=data/VoxSRC2023/voxconverse/ - echo -e "Get the DER results\n..." 
- perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \ - -c 0.25 \ - -r <(cat ${ref_dir}/${partition}/*.rttm) \ - -s exp/umap_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/umap_cluster/${partition}_${sad_type}_sad_res - - if [ ${get_each_file_res} -eq 1 ];then - single_file_res_dir=exp/umap_cluster/${partition}_${sad_type}_single_file_res - mkdir -p $single_file_res_dir - echo -e "\nGet the DER results for each file and the results will be stored underd ${single_file_res_dir}\n..." - - awk '{print $2}' exp/umap_cluster/${partition}_${sad_type}_sad_rttm | sort -u | while read file_name; do - perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \ - -c 0.25 \ - -r <(cat ${ref_dir}/${partition}/${file_name}.rttm) \ - -s <(grep "${file_name}" exp/umap_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res - done - echo "Done!" - fi -fi diff --git a/examples/voxconverse/v3/tools b/examples/voxconverse/v3/tools deleted file mode 120000 index c92f4172..00000000 --- a/examples/voxconverse/v3/tools +++ /dev/null @@ -1 +0,0 @@ -../../../tools \ No newline at end of file diff --git a/examples/voxconverse/v3/wespeaker b/examples/voxconverse/v3/wespeaker deleted file mode 120000 index 900c560b..00000000 --- a/examples/voxconverse/v3/wespeaker +++ /dev/null @@ -1 +0,0 @@ -../../../wespeaker \ No newline at end of file From 9da681e9ebbcee7257470c660f5c756f522a9b42 Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Fri, 23 Aug 2024 18:54:29 +0800 Subject: [PATCH 3/6] [diar] update README.md for voxconverse recipes --- examples/voxconverse/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/voxconverse/README.md b/examples/voxconverse/README.md index 3fb5772b..ab20d24c 100644 --- a/examples/voxconverse/README.md +++ b/examples/voxconverse/README.md @@ -1,3 +1,7 @@ This is a **WeSpeaker** speaker diarization recipe on the Voxconverse 2020 dataset. 
It focused on a ``in the wild`` scenario, which was collected from YouTube videos with a semi-automatic pipeline and released for the diarization track in VoxSRC 2020 Challenge. See https://www.robots.ox.ac.uk/~vgg/data/voxconverse/ for more detailed information. Two recipes are provided, including **v1** and **v2**. Their only difference is that in **v2**, we split the Fbank extraction, embedding extraction and clustering modules to different stages. We recommend newcomers to follow the **v2** recipe and run it stage by stage. + +🔥 UPDATE 2024.08.20: +* silero-vad v5.1 is used in place of v3.1 +* umap dimensionality reduction + hdbscan clustering is also supported \ No newline at end of file From 67bc9128008e9be23de30ded2bfc40ac2cf0805a Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Fri, 23 Aug 2024 19:00:59 +0800 Subject: [PATCH 4/6] [diar] update README.md --- examples/voxconverse/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/voxconverse/README.md b/examples/voxconverse/README.md index ab20d24c..85af1c2d 100644 --- a/examples/voxconverse/README.md +++ b/examples/voxconverse/README.md @@ -4,4 +4,4 @@ Two recipes are provided, including **v1** and **v2**. 
Their only difference is 🔥 UPDATE 2024.08.20: * silero-vad v5.1 is used in place of v3.1 -* umap dimensionality reduction + hdbscan clustering is also supported \ No newline at end of file +* umap dimensionality reduction + hdbscan clustering is also supported in v2 From 418e98128b1684274e1e1e7edd17dd24b2ae18e5 Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Fri, 23 Aug 2024 19:06:09 +0800 Subject: [PATCH 5/6] [diar] update voxconverse/v2/run.sh --- examples/voxconverse/v2/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/voxconverse/v2/run.sh b/examples/voxconverse/v2/run.sh index d5bcd0c4..6c83171c 100755 --- a/examples/voxconverse/v2/run.sh +++ b/examples/voxconverse/v2/run.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright (c) 2022-2023 Xu Xiang # 2022 Zhengyang Chen (chenzhengyang117@gmail.com) +# 2024 Hongji Wang (jijijiang77@gmail.com) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 555bde300b609aa5a3eda228d406e258cb861bb7 Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Fri, 23 Aug 2024 19:09:49 +0800 Subject: [PATCH 6/6] [docs] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index afb05573..690e93e8 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ pre-commit install # for clean and tidy code ``` ## 🔥 News -* 2024.08.20: Update diarization recipe for VoxConverse dataset by leveraging umap dimensionality reduction and hdbscan clustering, see [#347](https://github.com/wenet-e2e/wespeaker/pull/347). +* 2024.08.20: Update diarization recipe for VoxConverse dataset by leveraging umap dimensionality reduction and hdbscan clustering, see [#347](https://github.com/wenet-e2e/wespeaker/pull/347) and [#352](https://github.com/wenet-e2e/wespeaker/pull/352). * 2024.08.18: Support using ssl pre-trained models as the frontend. 
The [WavLM recipe](https://github.com/wenet-e2e/wespeaker/blob/master/examples/voxceleb/v2/run_wavlm.sh) is also provided, see [#344](https://github.com/wenet-e2e/wespeaker/pull/344). * 2024.05.15: Add support for [quality-aware score calibration](https://arxiv.org/pdf/2211.00815), see [#320](https://github.com/wenet-e2e/wespeaker/pull/320). * 2024.04.25: Add support for the gemini-dfresnet model, see [#291](https://github.com/wenet-e2e/wespeaker/pull/291).