diff --git a/examples/sre/v3/README b/examples/sre/v3/README new file mode 100644 index 00000000..da92aa03 --- /dev/null +++ b/examples/sre/v3/README @@ -0,0 +1,29 @@
+Changed make_system_sad.py a little so that it splits a large data set into parts
+when extracting VAD. Otherwise it took ages to start, and this is also helpful in
+case of a crash, since the output is saved after each part instead of after the
+whole set.
+
+# We use some scripts from Kaldi (combine_data.sh and fix_data_dir.sh)
+
+# This should not be needed anymore.
+# ln -s $KALDI_ROOT/egs/wsj/s5/utils
+# export PATH=$PATH:$(pwd)/utils/ # This is necessary since some Kaldi scripts assume other Kaldi scripts exist in the path.
+#export PATH=$PATH:$KALDI_ROOT/
+
+
+CTS
+                              spk / utt
+Org. data                    6867 / 605760
+After VAD                    6867 / 605704
+After removing T < 5s        6867 / 604774
+After removing utt/spk < 3   6867 / 604774
+
+VOX
+                              spk / utt
+Org. data                    7245 / 1245525
+After VAD                    7245 / 1245469
+After removing T < 5s        7245 / 816385
+After removing utt/spk < 3   7245 / 816385
+
+Total
+After removing utt/spk < 3   14112 / 1421159
\ No newline at end of file
diff --git a/examples/sre/v3/README.md b/examples/sre/v3/README.md new file mode 100644 index 00000000..2482cddd --- /dev/null +++ b/examples/sre/v3/README.md @@ -0,0 +1,99 @@
+### Main differences from ../v2
+* The training data is the CTS superset plus VoxCeleb with the GSM codec applied
+* The test data is SRE16, SRE18, and SRE21
+* Preprocessing of embeddings before backend/scoring is supported
+
+### Important
+Similarly to ../v2, this recipe uses Silero VAD (https://github.com/snakers4/silero-vad),
+downloaded from https://github.com/snakers4/silero-vad/archive/refs/tags/v4.0.zip
+If you intend to use this recipe for an evaluation/competition, make sure to check that
+it is allowed to use the data that has been used to train Silero.
+
+### Instructions
+* Set the paths in stage 1. The variable ```sre_data_dir``` is assumed to be prepared by
+  Kaldi (https://github.com/kaldi-asr/kaldi/tree/master/egs/sre16/v2).
+  Only the eval and unlabeled (major) data of sre16 is taken from there.
+  ```voxceleb_dir``` is the path to voxceleb prepared by wespeaker (```../../voxceleb/v2```).
+  If you set it to "" (empty string), the preparation will be run here. For the other datasets,
+  the path to the folder provided by the LDC should be provided. The relevant LDC numbers and
+  file names of the data can be seen in the script. If you don't have
+  one or more of the "eval/dev" sets of "sre16", "sre18" or "sre21" and do not specify them, you may
+  have to comment them out in some more places in order to avoid crashes. (Eventually
+  the script will hopefully be made more robust to this.)
+  If you don't have the CTS superset data, you can skip stage 5 in ```local/prepare_data.sh```
+  and instead replace the CTS data with some other data, e.g., the training data prepared in ```../v2```.
+  If so, it is probably easiest to name this data "CTS" since this name is assumed later
+  in the recipe.
+* Select which torchrun command to use in stage 3. The first line
+  (currently commented) is for "single-node, multi-worker" (one
+  pytorch job per machine). The second line is for "Stacked
+  single-node multi-worker" (more than one pytorch job may be
+  submitted to the same node in your cluster). See
+  https://pytorch.org/docs/stable/elastic/run.html for explanations,
+  and the sketch at the end of this section for an example of the two variants.
+* Stage 3 (training) and stage 4 (embedding extraction) need GPUs. You may have
+  to arrange how to run these parts based on your environment.
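+
+For reference, a minimal sketch of the two torchrun variants described in the linked
+PyTorch documentation is given below. The training script name and its arguments are
+placeholders here; check stage 3 of the run script for the exact command used in this recipe.
+```
+# Single-node, multi-worker: one torchrun job owns the whole node.
+torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+    wespeaker/bin/train.py --config conf/resnet.yaml
+
+# "Stacked" single-node, multi-worker: rendezvous on a free local port so that
+# several independent torchrun jobs can be submitted to the same node.
+torchrun --rdzv_backend=c10d --rdzv_endpoint=localhost:0 --nnodes=1 \
+    --nproc_per_node=$num_gpus wespeaker/bin/train.py --config conf/resnet.yaml
+```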
+
+
+### Explanation of embedding processing
+
+The code supports flexible combinations of embedding processing steps, such as length-norm and LDA.
+A processing chain is specified, e.g., as follows:
+```
+mean-subtract --scp $mean1_scp | length-norm | lda --scp $lda_scp --utt2spk $utt2spk --dim $lda_dim | length-norm
+```
+The script ```wespeaker/bin/prep_embd_proc.py``` takes such a processing chain as input, loops through the processing steps (separated by ```|```), calculates
+the necessary processing parameters (means, LDA transforms, etc.) and stores the whole processing chain with its parameters in
+pickle format. The parameters for each step are calculated sequentially, and the data specified for the parameter estimation of a step will
+be processed by the earlier steps. Therefore, the data for the different steps can be different. For example, when estimating LDA in the above chain, the data given by ```$lda_scp``` will first be processed by ```mean-subtract```, whose parameters were estimated on ```$mean1_scp```, which could be a different dataset.
+In scenarios where unlabeled domain adaptation data is available, we want to use this data for the first mean subtraction while still using the out-of-domain data for LDA estimation. This CANNOT be achieved by specifying the processing chain
+```
+mean-subtract --scp $indomain_scp | length-norm | lda --scp $lda_scp --utt2spk $utt2spk --dim $lda_dim | length-norm
+```
+since this would have the consequence that in LDA estimation, the data (```$lda_scp```) would be subjected to mean subtraction
+using the mean of the in-domain data (```$indomain_scp```). To solve this, we have an additional script ```wespeaker/bin/update_embd_proc.py```, used as follows:
+```
+new_link="mean-subtract --scp $indomain_scp"
+python wespeaker/bin/update_embd_proc.py --in_path $preprocessing_path_cts_aug --out_path $preprocessing_path_sre18_unlab --link_no_to_remove 0 --new_link "$new_link"
+```
+where ```$preprocessing_path_cts_aug``` is the path to the pickled original processing chain and ```$preprocessing_path_sre18_unlab``` is the path to the new pickled processing chain.
+The script will remove link 0, i.e., ```mean-subtract --scp $mean1_scp```, and replace it with ```mean-subtract --scp $indomain_scp```.
+
+
+### Regarding extractor training data pruning
+
+Similarly to ```../v2``` and Kaldi's sre16 recipe, we discard some of the training utterances based on duration, as well as training speakers based on their number of utterances.
+This is controlled in stage 9 of ```local/prepare_data.sh```. It is quite flexible but currently a bit messy, and some consequences of the settings are not obvious, so some explanation is provided here.
+There are three "blocks" in stage 9:
+* The first block discards all utterances shorter than or equal to a specified duration (currently set to 5s) according to VOICED DURATION.
+* The second block discards all utterances shorter than or equal to a specified duration (currently set to 5s) according to TOTAL DURATION, i.e., ignoring VAD info.
+* The third block discards all speakers that have no more than a specified number of utterances (currently set to 2, i.e., speakers with 3 or more utterances are kept).
+It is possible to set the thresholds differently for the different sets; see the sketch right after this list.
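+
+As an illustration, the thresholds are kept in per-set associative arrays in stage 9 of ```local/prepare_data.sh```; the sketch below mirrors the current values, and changing the pruning for one set only requires editing its entry:
+```
+declare -A voice_dur_threshold=( ["cts"]=5.0 ["vox_gsmfr"]=0.0 )  # block 1: voiced duration
+declare -A dur_threshold=( ["cts"]=0.0 ["vox_gsmfr"]=5.0 )        # block 2: total duration
+declare -A uttPerSpk_threshold=( ["cts"]=2 ["vox_gsmfr"]=2 )      # block 3: a speaker is kept only if #utt > threshold
+```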
IMPORTANT: The pruning in block 1 is based on ```data/data_set_name/utt2voice_dur```, which is calculated
+from the VAD info, so if a recording does not contain any speech, it will not be present in utt2voice_dur and will therefore be discarded in this block even if the duration threshold is
+set to, e.g., -1. If we want such utterances to be kept for a set, we should not run this block for that set (as is currently the case for voxceleb). The current setup is as follows:
+ 1. Apply block one to CTS but not VoxCeleb.
+ 2. Apply block two to VoxCeleb but not CTS. (Applying this block to CTS would have no effect if the thresholds are the same, since the total duration is always larger than or equal to the voiced duration.)
+ 3. Apply block three to both CTS and VoxCeleb.
+
+ This means VoxCeleb recordings are kept even if they have no speech according to VAD. The later shard creation stage applies VAD if available, and otherwise keeps the file as it is. So VoxCeleb recordings with no speech according to VAD will NOT be discarded (but there are only around 70 of them, which is unlikely to have any effect on the trained system). Also, there is a risk that pruning according to total duration while applying VAD in shard creation could result in recordings shorter than "num_frms". These will be zero-padded at training time so there will be no crash, but this is probably also suboptimal.
+These settings are arguably somewhat weird. Applying block one also to voxceleb (and not using block two at all) would be more reasonable, but it seems to degrade the performance due to discarding too many files. A better solution than the current one would be to try smaller thresholds than 5s, but we have not had time to explore this yet. Also, it would be reasonable to discard recordings with no speech according to VAD in the shard creation stage. However, when no VAD is available for a file, the shard creation code does not know whether this is because no speech was detected in this file by VAD, or because VAD was not run for this file. Since we want the possibility to keep recordings for which the latter is the case, we have it this way (it could, for example, be considered not to use VAD for voxceleb at all, in which case we need to avoid discarding these files at the shard creation stage). A more flexible and clearer solution is needed and we will work on this in future updates.
+
+
+### Some data statistics
+| | CTS #utt | CTS #spk | VOX #utt | VOX #spk | comment |
+| --- | --- | --- | --- | --- | --- |
+| Original data | 605760 | 6867 | 1245525 | 7245 | |
+| Excluding recordings with no speech according to VAD | 605704 | 6867 | 1245455 | 7245 | VAD is a bit random so these numbers could vary slightly, especially for voxceleb. |
+| After filtering according to voiced duration | 604774 | 6867 | 816411 | 7245 | Same caveat here. We don't use this for voxceleb in the current settings. |
+| After filtering according to total duration | - | - | 868326 | 7245 | Haven't checked this for CTS. |
+
+No speakers are discarded in block three with the current settings.
+
+
+### Things to explore
+Very few things have been tuned. For example, the following could be low-hanging fruit:
+* The above-mentioned pruning rules.
+* Utterance durations of the training segments.
+* Should voxceleb be included? Is applying the GSM codec a good idea? (Note that the GSM codec is applied in the data preparation stage while augmentation is applied at training time, i.e., the GSM codec comes before the augmentations.
This is not so realistic, since in reality noise and reverberation come before the data is recorded and encoded. However, it is consistent with CTS, where we also apply the augmentations to already encoded audio, since it was encoded at recording time.)
+* The other architectures.
+
+We will tune this further in the future. We are also happy to hear about any such results obtained by others.
\ No newline at end of file
diff --git a/examples/sre/v3/conf/resnet.yaml b/examples/sre/v3/conf/resnet.yaml new file mode 100644 index 00000000..83294c06 --- /dev/null +++ b/examples/sre/v3/conf/resnet.yaml @@ -0,0 +1,81 @@
+### train configuration
+
+exp_dir: exp/ResNet34-TSTP-emb256-fbank40-num_frms200-aug0.6-spFalse-saFalse-Softmax-SGD-epoch150
+gpus: "[0,1]"
+num_avg: 10
+enable_amp: False # whether to enable automatic mixed precision training
+
+seed: 42
+num_epochs: 150
+save_epoch_interval: 5 # save model every 5 epochs
+log_batch_interval: 100 # log every 100 batches
+
+dataloader_args:
+  batch_size: 256
+  num_workers: 7 # Total number of cores will be (this +1)*num_gpus
+  pin_memory: False
+  prefetch_factor: 8
+  drop_last: True
+
+dataset_args:
+  # the sample number which will be traversed within one epoch; if the value equals 0,
+  # the utterance number in the dataset will be used as the sample_num_per_epoch.
+  sample_num_per_epoch: 780000
+  shuffle: True
+  shuffle_args:
+    shuffle_size: 1500
+  filter: True
+  filter_args:
+    min_num_frames: 100
+    max_num_frames: 300
+  resample_rate: 8000
+  speed_perturb: False
+  num_frms: 200
+  aug_prob: 0.6 # prob to add reverb & noise aug per sample
+  fbank_args:
+    num_mel_bins: 64
+    frame_shift: 10
+    frame_length: 25
+    dither: 1.0
+  spec_aug: False
+  spec_aug_args:
+    num_t_mask: 1
+    num_f_mask: 1
+    max_t: 10
+    max_f: 8
+    prob: 0.6
+
+model: ResNet34 # ResNet18, ResNet34, ResNet50, ResNet101, ResNet152
+model_init: null
+model_args:
+  feat_dim: 64
+  embed_dim: 256
+  pooling_func: "TSTP" # TSTP, ASTP, MQMHASTP
+  two_emb_layer: False
+projection_args:
+  project_type: "softmax" # add_margin, arc_margin, sphere, softmax, arc_margin_intertopk_subcenter
+
+margin_scheduler: MarginScheduler
+margin_update:
+  initial_margin: 0.0
+  final_margin: 0.2
+  increase_start_epoch: 20
+  fix_start_epoch: 40
+  update_margin: True
+  increase_type: "exp" # exp, linear
+
+loss: CrossEntropyLoss
+loss_args: {}
+
+optimizer: SGD
+optimizer_args:
+  momentum: 0.9
+  nesterov: True
+  weight_decay: 0.0001
+
+scheduler: ExponentialDecrease
+scheduler_args:
+  initial_lr: 0.1
+  final_lr: 0.00005
+  warm_up_epoch: 6
+  warm_from_zero: True
diff --git a/examples/sre/v3/local/create_preproc_embd_lists.sh b/examples/sre/v3/local/create_preproc_embd_lists.sh new file mode 100755 index 00000000..8f868ec0 --- /dev/null +++ b/examples/sre/v3/local/create_preproc_embd_lists.sh @@ -0,0 +1,119 @@
+#!/bin/bash
+
+# Copyright (c) 2024 Johan Rohdin (rohdin@fit.vutbr.cz)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ + +# The preprocessed embeddings are already stored but we need to create the lists +# as score.sh wants them. + +exp_dir=$1 +data=data + +# We have three different preprocessors for which we need to prepare the lists +# embd_proc_cts_aug.pkl # LDA and cts_aug mean subtraction +# embd_proc_sre16_major.pkl # LDA and sre16_major mean subtracion (Only used for SRE16) +# embd_proc_sre18_dev_unlabeled.pkl # LDA and sre18_dev_unlabeled mean subtracion (Only used for SRE18) + + +### !!! +# Note that xvector2 is only a hack for BUT + +################################################################## +# CTS AUG for all sets +echo "mean vector of enroll" +python tools/vector_mean.py \ + --spk2utt ${data}/sre16/eval/enrollment/spk2utt \ + --xvector_scp $exp_dir/embeddings/sre16/eval/enrollment/xvector_proc_embd_proc_cts_aug.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre16/eval/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre18/dev/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre18/dev/enrollment/xvector_proc_embd_proc_cts_aug.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre18/dev/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre18/eval/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre18/eval/enrollment/xvector_proc_embd_proc_cts_aug.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre18/eval/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre21/dev/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre21/dev/enrollment/xvector_proc_embd_proc_cts_aug.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre21/dev/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre21/eval/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre21/eval/enrollment/xvector_proc_embd_proc_cts_aug.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre21/eval/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.ark + + +# Create one scp with both enroll and test since this is expected by some scripts +cat ${exp_dir}/embeddings/sre16/eval/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.scp \ + ${exp_dir}/embeddings/sre16/eval/test/xvector_proc_embd_proc_cts_aug.scp \ + > ${exp_dir}/embeddings/sre16/eval/xvector_proc_embd_proc_cts_aug.scp + +cat ${exp_dir}/embeddings/sre18/dev/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.scp \ + ${exp_dir}/embeddings/sre18/dev/test/xvector_proc_embd_proc_cts_aug.scp \ + > ${exp_dir}/embeddings/sre18/dev/xvector_proc_embd_proc_cts_aug.scp + +cat ${exp_dir}/embeddings/sre18/eval/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.scp \ + ${exp_dir}/embeddings/sre18/eval/test/xvector_proc_embd_proc_cts_aug.scp \ + > ${exp_dir}/embeddings/sre18/eval/xvector_proc_embd_proc_cts_aug.scp + +cat ${exp_dir}/embeddings/sre21/dev/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.scp \ + ${exp_dir}/embeddings/sre21/dev/test/xvector_proc_embd_proc_cts_aug.scp \ + > ${exp_dir}/embeddings/sre21/dev/xvector_proc_embd_proc_cts_aug.scp + +cat ${exp_dir}/embeddings/sre21/eval/enrollment/enroll_spk_xvector_proc_embd_proc_cts_aug.scp \ + ${exp_dir}/embeddings/sre21/eval/test/xvector_proc_embd_proc_cts_aug.scp \ + > ${exp_dir}/embeddings/sre21/eval/xvector_proc_embd_proc_cts_aug.scp + + +################################################################## +# sre16_major for sre16 eval +echo "mean vector of enroll" 
+python tools/vector_mean.py \ + --spk2utt ${data}/sre16/eval/enrollment/spk2utt \ + --xvector_scp $exp_dir/embeddings/sre16/eval/enrollment/xvector_proc_embd_proc_sre16_major.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre16/eval/enrollment/enroll_spk_xvector_proc_embd_proc_sre16_major.ark + +# Create one scp with both enroll and test since this is expected by some scripts +cat ${exp_dir}/embeddings/sre16/eval/enrollment/enroll_spk_xvector_proc_embd_proc_sre16_major.scp \ + ${exp_dir}/embeddings/sre16/eval/test/xvector_proc_embd_proc_sre16_major.scp \ + > ${exp_dir}/embeddings/sre16/eval/xvector_proc_embd_proc_sre16_major.scp + + +################################################################## +# sre18_dev_unlabeled for sre18 dev/eval +echo "mean vector of enroll" +python tools/vector_mean.py \ + --spk2utt ${data}/sre18/dev/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre18/dev/enrollment/xvector_proc_embd_proc_sre18_dev_unlabeled.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre18/dev/enrollment/enroll_spk_xvector_proc_embd_proc_sre18_dev_unlabeled.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre18/eval/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre18/eval/enrollment/xvector_proc_embd_proc_sre18_dev_unlabeled.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre18/eval/enrollment/enroll_spk_xvector_proc_embd_proc_sre18_dev_unlabeled.ark + +# Create one scp with both enroll and test since this is expected by some scripts +cat ${exp_dir}/embeddings/sre18/dev/enrollment/enroll_spk_xvector_proc_embd_proc_sre18_dev_unlabeled.scp \ + ${exp_dir}/embeddings/sre18/dev/test/xvector_proc_embd_proc_sre18_dev_unlabeled.scp \ + > ${exp_dir}/embeddings/sre18/dev/xvector_proc_embd_proc_sre18_dev_unlabeled.scp + +cat ${exp_dir}/embeddings/sre18/eval/enrollment/enroll_spk_xvector_proc_embd_proc_sre18_dev_unlabeled.scp \ + ${exp_dir}/embeddings/sre18/eval/test/xvector_proc_embd_proc_sre18_dev_unlabeled.scp \ + > ${exp_dir}/embeddings/sre18/eval/xvector_proc_embd_proc_sre18_dev_unlabeled.scp + diff --git a/examples/sre/v3/local/download_data.sh b/examples/sre/v3/local/download_data.sh new file mode 120000 index 00000000..b02d74af --- /dev/null +++ b/examples/sre/v3/local/download_data.sh @@ -0,0 +1 @@ +../../../voxceleb/v2/local/download_data.sh \ No newline at end of file diff --git a/examples/sre/v3/local/extract_sre.sh b/examples/sre/v3/local/extract_sre.sh new file mode 100755 index 00000000..f3e053c7 --- /dev/null +++ b/examples/sre/v3/local/extract_sre.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +# Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com) +# 2023 Zhengyang Chen (chenzhengyang117@gmail.com) +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +exp_dir='' +model_path='' +nj=4 +gpus="[0,1]" +data_type="shard" # shard/raw/feat +data=data +reverb_data=data/rirs/lmdb +noise_data=data/musan/lmdb +aug_plda_data=0 + +. 
tools/parse_options.sh +set -e + + + +#### +true && { +data_name_array=( + "cts_aug" + "sre16/major" + "sre16/eval/enrollment" + "sre16/eval/test" + "sre18/dev/enrollment/" + "sre18/dev/test/" + "sre18/dev/unlabeled/" + "sre18/eval/enrollment/" + "sre18/eval/test/" + "sre21/dev/enrollment/" + "sre21/dev/test/" + "sre21/eval/enrollment/" + "sre21/eval/test/" +) +data_list_path_array=( + "${data}/cts_aug/${data_type}.list" + "${data}/sre16/major/${data_type}.list" + "${data}/sre16/eval/enrollment/${data_type}.list" + "${data}/sre16/eval/test/${data_type}.list" + "${data}/sre18/dev/enrollment/${data_type}.list" + "${data}/sre18/dev/test/${data_type}.list" + "${data}/sre18/dev/unlabeled/${data_type}.list" + "${data}/sre18/eval/enrollment/${data_type}.list" + "${data}/sre18/eval/test/${data_type}.list" + "${data}/sre21/dev/enrollment/${data_type}.list" + "${data}/sre21/dev/test/${data_type}.list" + "${data}/sre21/eval/enrollment/${data_type}.list" + "${data}/sre21/eval/test/${data_type}.list" +) +data_scp_path_array=( + "${data}/cts_aug/wav.scp" + "${data}/sre16/major/wav.scp" + "${data}/sre16/eval/enrollment/wav.scp" + "${data}/sre16/eval/test/wav.scp" + "${data}/sre18/dev/enrollment/wav.scp" + "${data}/sre18/dev/test/wav.scp" + "${data}/sre18/dev/unlabeled/wav.scp" + "${data}/sre18/eval/enrollment/wav.scp" + "${data}/sre18/eval/test/wav.scp" + "${data}/sre21/dev/enrollment/wav.scp" + "${data}/sre21/dev/test/wav.scp" + "${data}/sre21/eval/enrollment/wav.scp" + "${data}/sre21/eval/test/wav.scp" +) # to count the number of wavs +nj_array=($nj $nj $nj $nj $nj $nj $nj $nj $nj $nj $nj $nj $nj) +batch_size_array=(1 1 1 1 1 1 1 1 1 1 1 1 1) # batch_size of test set must be 1 !!! +num_workers_array=(1 1 1 1 1 1 1 1 1 1 1 1 1) +aug_prob_array=(0.67 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0) +} + + +count=${#data_name_array[@]} + +true && { +for i in $(seq 0 $(($count - 1))); do + echo $i + wavs_num=$(wc -l ${data_scp_path_array[$i]} | awk '{print $1}') + bash tools/extract_embedding.sh --exp_dir ${exp_dir} \ + --model_path $model_path \ + --data_type ${data_type} \ + --data_list ${data_list_path_array[$i]} \ + --wavs_num ${wavs_num} \ + --store_dir ${data_name_array[$i]} \ + --batch_size ${batch_size_array[$i]} \ + --num_workers ${num_workers_array[$i]} \ + --aug_prob ${aug_prob_array[$i]} \ + --reverb_data ${reverb_data} \ + --noise_data ${noise_data} \ + --nj ${nj_array[$i]} \ + --gpus $gpus +done + +wait +} + +# Create enrollment models. This is the first order statistics. The zeroth order +# (the number of enrollment vectors) should, in principle, also be considered. 
+echo "mean vector of enroll" +python tools/vector_mean.py \ + --spk2utt ${data}/sre16/eval/enrollment/spk2utt \ + --xvector_scp $exp_dir/embeddings/sre16/eval/enrollment/xvector.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre16/eval/enrollment/enroll_spk_xvector.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre18/dev/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre18/dev/enrollment/xvector.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre18/dev/enrollment/enroll_mdl_xvector.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre18/eval/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre18/eval/enrollment/xvector.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre18/eval/enrollment/enroll_mdl_xvector.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre21/dev/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre21/dev/enrollment/xvector.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre21/dev/enrollment/enroll_mdl_xvector.ark + +python tools/vector_mean.py \ + --spk2utt ${data}/sre21/eval/enrollment/mdl_id2utt \ + --xvector_scp $exp_dir/embeddings/sre21/eval/enrollment/xvector.scp \ + --spk_xvector_ark $exp_dir/embeddings/sre21/eval/enrollment/enroll_mdl_xvector.ark + + +# Create one scp with both enroll and test since this is expected by some scripts +cat ${exp_dir}/embeddings/sre16/eval/enrollment/enroll_spk_xvector.scp \ + ${exp_dir}/embeddings/sre16/eval/test/xvector.scp \ + > ${exp_dir}/embeddings/sre16/eval/xvector.scp + +cat ${exp_dir}/embeddings/sre18/dev/enrollment/enroll_mdl_xvector.scp \ + ${exp_dir}/embeddings/sre18/dev/test/xvector.scp \ + > ${exp_dir}/embeddings/sre18/dev/xvector.scp + +cat ${exp_dir}/embeddings/sre18/eval/enrollment/enroll_mdl_xvector.scp \ + ${exp_dir}/embeddings/sre18/eval/test/xvector.scp \ + > ${exp_dir}/embeddings/sre18/eval/xvector.scp + +cat ${exp_dir}/embeddings/sre21/dev/enrollment/enroll_mdl_xvector.scp \ + ${exp_dir}/embeddings/sre21/dev/test/xvector.scp \ + > ${exp_dir}/embeddings/sre21/dev/xvector.scp + +cat ${exp_dir}/embeddings/sre21/eval/enrollment/enroll_mdl_xvector.scp \ + ${exp_dir}/embeddings/sre21/eval/test/xvector.scp \ + > ${exp_dir}/embeddings/sre21/eval/xvector.scp + + + +echo "Embedding dir is (${exp_dir}/embeddings)." 
+ + diff --git a/examples/sre/v3/local/filter_utt_accd_dur.py b/examples/sre/v3/local/filter_utt_accd_dur.py new file mode 120000 index 00000000..7dcfa78a --- /dev/null +++ b/examples/sre/v3/local/filter_utt_accd_dur.py @@ -0,0 +1 @@ +../../../sre/v2/local/filter_utt_accd_dur.py \ No newline at end of file diff --git a/examples/sre/v3/local/generate_sre_aug.py b/examples/sre/v3/local/generate_sre_aug.py new file mode 120000 index 00000000..74265285 --- /dev/null +++ b/examples/sre/v3/local/generate_sre_aug.py @@ -0,0 +1 @@ +../../../sre/v2/local/generate_sre_aug.py \ No newline at end of file diff --git a/examples/sre/v3/local/m4a2wav.pl b/examples/sre/v3/local/m4a2wav.pl new file mode 120000 index 00000000..5c74d678 --- /dev/null +++ b/examples/sre/v3/local/m4a2wav.pl @@ -0,0 +1 @@ +../../../voxceleb/v2/local/m4a2wav.pl \ No newline at end of file diff --git a/examples/sre/v3/local/make_sre16_eval.pl b/examples/sre/v3/local/make_sre16_eval.pl new file mode 100755 index 00000000..6e50e8a0 --- /dev/null +++ b/examples/sre/v3/local/make_sre16_eval.pl @@ -0,0 +1,208 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +use File::Basename; + +# Copyright 2017 David Snyder +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# Apache 2.0 +# + +# This script is taken from the Kaldi SRE16 recipe. For the Wespeaker recipe, we +# have done a few very minor changes, namely: +# 1. The path to the keys tar file are provided as an additional input argument. +# 2. The produced wav.scp will use ffmpeg instead of sph2pipe. +# 3. Some changes in paths to fit wespeaker recipe. +# 4. Warning if wav files have no meta data. Mainly happens if the directory +# searched for wav files contains files that are not in the original data. +# 5 Formatting to fit Wespeaker's requirement. + +if (@ARGV != 3) { + print STDERR "Usage: $0 " , + " \n"; + print STDERR "e.g. $0 /export/corpora/SRE/R149_0_1 data/\n"; + exit(1); +} + +($db_base, $evalset_keys, $out_dir) = @ARGV; + +# Handle enroll +$out_dir_enroll = "$out_dir/sre16/eval/enroll"; +if (system("mkdir -p $out_dir_enroll")) { + die "Error making directory $out_dir_enroll"; +} + +$tmp_dir_enroll = "$out_dir_enroll/tmp"; +if (system("mkdir -p $tmp_dir_enroll") != 0) { + die "Error making directory $tmp_dir_enroll"; +} + +open(SPKR, ">$out_dir_enroll/utt2spk") + || die "Could not open the output file $out_dir_enroll/utt2spk"; +open(WAV, ">$out_dir_enroll/wav.scp") + || die "Could not open the output file $out_dir_enroll/wav.scp"; +open(META, "<$db_base/docs/sre16_eval_enrollment.tsv") + or die "cannot open wav list"; +%utt2fixedutt = (); +while () { + $line = $_; + @toks = split(" ", $line); + $spk = $toks[0]; + $utt = $toks[1]; + if ($utt ne "segment") { + print SPKR "${spk}-${utt} $spk\n"; + $utt2fixedutt{$utt} = "${spk}-${utt}"; + } +} + +# Using cmd here and a few other places to satisfy the 80 char. requirement. +my $cmd1="find $db_base/data/enrollment/ -name '*.sph'". + " > $tmp_dir_enroll/sph.list"; +if (system($cmd1) != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir_enroll/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $utt=$utt2fixedutt{$t1[0]}; + if ($utt) { + print WAV "$utt", + " ffmpeg -nostdin -i $sph -ac 1 -ar 8000 -f wav pipe:1 |\n"; + }else { + print("WARNING $t1[0] not in meta data. 
Will not be used.\n"); + } +} +close(WAV) || die; +close(SPKR) || die; + +# Handle test +$out_dir_test= "$out_dir/sre16/eval/test"; +if (system("mkdir -p $out_dir_test")) { + die "Error making directory $out_dir_test"; +} + +$tmp_dir_test = "$out_dir_test/tmp"; +if (system("mkdir -p $tmp_dir_test") != 0) { + die "Error making directory $tmp_dir_test"; +} + + +if (system("cp $evalset_keys $out_dir_test")) { + die "Error copying sre16 keys."; +} + +my $key_name = basename( $evalset_keys ); + +if (system("tar -xvf $out_dir_test/$key_name -C $out_dir_test")) { + die "Could not untar sre16 keys."; +} + + +open(SPKR, ">$out_dir_test/utt2spk") + || die "Could not open the output file $out_dir_test/utt2spk"; +open(WAV, ">$out_dir_test/wav.scp") + || die "Could not open the output file $out_dir_test/wav.scp"; +open(TRIALS, ">$out_dir_test/trials") + || die "Could not open the output file $out_dir_test/trials"; +open(TGL_TRIALS, ">$out_dir_test/trials_tgl") + || die "Could not open the output file $out_dir_test/trials_tgl"; +open(YUE_TRIALS, ">$out_dir_test/trials_yue") + || die "Could not open the output file $out_dir_test/trials_yue"; + +my $cmd2="find $db_base/data/test/ -name '*.sph' > $tmp_dir_test/sph.list"; +if (system($cmd2) != 0) { + die "Error getting list of sph files"; +} + + +open(KEY, "<$out_dir_test/R149_0_1/docs/sre16_eval_trial_key.tsv") + || die "Could not open trials file", + " $out_dir_test/R149_0_1/docs/sre16_eval_trial_key.tsv."; +open(SEG_KEY, "<$out_dir_test/R149_0_1/docs/sre16_eval_segment_key.tsv") + || die "Could not open trials file", + " $out_dir_test/R149_0_1/docs/sre16_eval_segment_key.tsv."; +open(LANG_KEY, "<$out_dir_test/R149_0_1/metadata/calls.tsv") + || die " Could not open trials file", + " $out_dir_test/R149_0_1/metadata/calls.tsv."; +open(WAVLIST, "<$tmp_dir_test/sph.list") or die "cannot open wav list"; + +%utt2call = (); +while() { + chomp; + $line = $_; + @toks = split(" ", $line); + $utt = $toks[0]; + $call = $toks[1]; + if ($utt ne "segment") { + $utt2call{$utt} = $call; + } +} +close(SEG_KEY) || die; + +%call2lang = (); +while() { + chomp; + $line = $_; + @toks = split(" ", $line); + $call = $toks[0]; + $lang = $toks[1]; + $call2lang{$call} = $lang; +} +close(LANG_KEY) || die; + +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $utt=$t1[0]; + print WAV "$utt"," ffmpeg -nostdin -i $sph -ac 1 -ar 8000 -f wav pipe:1 |\n"; + print SPKR "$utt $utt\n"; +} +close(WAV) || die; +close(SPKR) || die; + +while () { + $line = $_; + @toks = split(" ", $line); + $spk = $toks[0]; + $utt = $toks[1]; + $call = $utt2call{$utt}; + $target_type = $toks[3]; + if ($utt ne "segment") { + print TRIALS "${spk} ${utt} ${target_type}\n"; + if ($call2lang{$call} eq "tgl") { + print TGL_TRIALS "${spk} ${utt} ${target_type}\n"; + } elsif ($call2lang{$call} eq "yue") { + print YUE_TRIALS "${spk} ${utt} ${target_type}\n"; + } else { + die "Unexpected language $call2lang{$call} for utterance $utt."; + } + } +} + +close(TRIALS) || die; +close(TGL_TRIALS) || die; +close(YUE_TRIALS) || die; + +my $cmd3="tools/utt2spk_to_spk2utt.pl". 
+ " $out_dir_enroll/utt2spk >$out_dir_enroll/spk2utt"; +if (system($cmd3) != 0) { + die "Error creating spk2utt file in directory $out_dir_enroll"; +} + +my $cmd4="tools/utt2spk_to_spk2utt.pl $out_dir_test/utt2spk >$out_dir_test/spk2utt"; +if (system($cmd4) != 0) { + die "Error creating spk2utt file in directory $out_dir_test"; +} + +if (system("tools/fix_data_dir.sh $out_dir_enroll") != 0) { + die "Error fixing data dir $out_dir_enroll"; +} +if (system("tools/fix_data_dir.sh $out_dir_test") != 0) { + die "Error fixing data dir $out_dir_test"; +} diff --git a/examples/sre/v3/local/make_sre16_unlabeled.pl b/examples/sre/v3/local/make_sre16_unlabeled.pl new file mode 100755 index 00000000..4766cde1 --- /dev/null +++ b/examples/sre/v3/local/make_sre16_unlabeled.pl @@ -0,0 +1,111 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2017 David Snyder +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz): Just some minor changes +# in paths to fit the Wespeaker recipe organization as well +# formatting to fit Wespeaker's requirements. +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 ", " + \n"; + print STDERR "e.g. $0", + "/export/corpora/SRE/LDC2016E46_SRE16_Call_My_Net_Training_Data data/\n"; + exit(1); +} + +($db_base, $out_dir) = @ARGV; + +# Handle major subset. +$out_dir_major = "$out_dir/sre16/major"; +if (system("mkdir -p $out_dir_major")) { + die "Error making directory $out_dir_major"; +} + +$tmp_dir_major = "$out_dir_major/tmp"; +if (system("mkdir -p $tmp_dir_major") != 0) { + die "Error making directory $tmp_dir_major"; +} + +open(SPKR, ">$out_dir_major/utt2spk") + || die "Could not open the output file $out_dir_major/utt2spk"; +open(WAV, ">$out_dir_major/wav.scp") + || die "Could not open the output file $out_dir_major/wav.scp"; + +my $cmd1="find $db_base/data/unlabeled/major/ -name '*.sph'". + " > $tmp_dir_major/sph.list"; +if (system($cmd1) != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir_major/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $utt=$t1[0]; + print WAV "$utt"," ffmpeg -nostdin -i $sph -ac 1 -ar 8000 -f wav pipe:1 |\n"; + print SPKR "$utt $utt\n"; +} + +close(WAV) || die; +close(SPKR) || die; + +# Handle minor subset. +$out_dir_minor= "$out_dir/sre16/minor"; +if (system("mkdir -p $out_dir_minor")) { + die "Error making directory $out_dir_minor"; +} + +$tmp_dir_minor = "$out_dir_minor/tmp"; +if (system("mkdir -p $tmp_dir_minor") != 0) { + die "Error making directory $tmp_dir_minor"; +} + +open(SPKR, ">$out_dir_minor/utt2spk") + || die "Could not open the output file $out_dir_minor/utt2spk"; +open(WAV, ">$out_dir_minor/wav.scp") + || die "Could not open the output file $out_dir_minor/wav.scp"; + +my $cmd2="find $db_base/data/unlabeled/minor/ -name '*.sph'". + " > $tmp_dir_minor/sph.list"; +if (system($cmd2) != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir_minor/sph.list") + or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $utt=$t1[0]; + print WAV "$utt"," ffmpeg -nostdin -i $sph -ac 1 -ar 8000 -f wav pipe:1 |\n"; + print SPKR "$utt $utt\n"; +} +close(WAV) || die; +close(SPKR) || die; + +my $cmd3="tools/utt2spk_to_spk2utt.pl $out_dir_major/utt2spk". 
+ ">$out_dir_major/spk2utt"; +if (system($cmd3) != 0) { + die "Error creating spk2utt file in directory $out_dir_major"; +} + +my $cmd4="tools/utt2spk_to_spk2utt.pl $out_dir_minor/utt2spk". + " > $out_dir_minor/spk2utt"; +if (system($cmd4) != 0) { + die "Error creating spk2utt file in directory $out_dir_minor"; +} + +if (system("tools/fix_data_dir.sh $out_dir_major") != 0) { + die "Error fixing data dir $out_dir_major"; +} + +if (system("tools/fix_data_dir.sh $out_dir_minor") != 0) { + die "Error fixing data dir $out_dir_minor"; +} diff --git a/examples/sre/v3/local/make_system_sad.py b/examples/sre/v3/local/make_system_sad.py new file mode 100644 index 00000000..2e32b556 --- /dev/null +++ b/examples/sre/v3/local/make_system_sad.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 Xu Xiang +# 2023 Zhengyang Chen +# 2024 Johan Rohdin +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +os.environ["OMP_NUM_THREADS"] = "1" +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["MKL_NUM_THREADS"] = "1" +os.environ["VECLIB_MAXIMUM_THREADS"] = "1" +os.environ["NUMEXPR_NUM_THREADS"] = "1" + +import sys +import io +import functools +import concurrent.futures +import argparse +import importlib +import torchaudio +import subprocess + +import torch + + +def get_args(): + parser = argparse.ArgumentParser(description='') + parser.add_argument('--repo-path', required=True, + help='VAD model repo path') + parser.add_argument('--scp', required=True, help='wav scp') + parser.add_argument('--min-duration', required=True, + type=float, help='min duration') + args = parser.parse_args() + + return args + + +@functools.lru_cache(maxsize=1) +def load_wav( + wav_rxfilename, +): + """ This function reads audio file and return data in pytorch tensor. + "lru_cache" holds recently loaded audio so that can be called + many times on the same audio file. 
+ OPTIMIZE: controls lru_cache size for random access, + considering memory size + """ + if wav_rxfilename.endswith('|'): + # input piped command + p = subprocess.Popen(wav_rxfilename[:-1], shell=True, + stdout=subprocess.PIPE) + data, samplerate = torchaudio.load(io.BytesIO(p.stdout.read())) + elif wav_rxfilename == '-': + # stdin + data, samplerate = torchaudio.load(sys.stdin) + else: + # normal wav file + data, samplerate = torchaudio.load(wav_rxfilename) + return data.squeeze(0), samplerate + + +def read_scp(scp): + utt_wav_pair = [] + for line in open(scp, 'r'): + segs = line.strip().split() + if len(segs) > 2: + utt, wav = segs[0], ' '.join(segs[1:]) + else: + utt, wav = segs[0], segs[1] + utt_wav_pair.append((utt, wav)) + + return utt_wav_pair + + +def silero_vad(utt_wav_pair, repo_path, min_duration, + sampling_rate=8000, threshold=0.25): + + def module_from_file(module_name, file_path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + utils_vad = module_from_file("utils_vad", + os.path.join(repo_path, "utils_vad.py")) + model = utils_vad.init_jit_model( + os.path.join(repo_path, 'files/silero_vad.jit')) + + utt, wav = utt_wav_pair + + wav_f = wav + + wav, sr = load_wav(wav) + assert sr == sampling_rate, "Audio file {} has wrong sampling rate \ + ({} instead of {})".format(wav_f, sr, sampling_rate) + + speech_timestamps = utils_vad.get_speech_timestamps( + wav, model, sampling_rate=sampling_rate, + threshold=threshold) + + vad_result = "" + for item in speech_timestamps: + begin = item['start'] / sampling_rate + end = item['end'] / sampling_rate + if end - begin >= min_duration: + vad_result += "{}-{:08d}-{:08d} {} {:.3f} {:.3f}\n".format( + utt, int(begin * 1000), int(end * 1000), utt, begin, end) + + return vad_result + + +def main(): + args = get_args() + + vad = functools.partial(silero_vad, + repo_path=args.repo_path, + min_duration=args.min_duration) + utt_wav_pair_list = read_scp(args.scp) + # with concurrent.futures.ProcessPoolExecutor() as executor: + # print(''.join(executor.map(vad, utt_wav_pair_list)), end='') + # It seems the pool doesn't work well so split into chunks of max size n + # (e.g. 10000). Splitting like this also has the consequence that the VAD + # is printed after processing n files instead of after processing all files. + n = 10000 + utt_wav_pair_list_of_list = [utt_wav_pair_list[i * n:(i + 1) * n] + for i in range((len(utt_wav_pair_list) + n - 1) // n)] + for lol in utt_wav_pair_list_of_list: + with concurrent.futures.ProcessPoolExecutor() as executor: + print(''.join(executor.map(vad, lol)), end='') + + +if __name__ == '__main__': + torch.set_num_threads(1) + + main() diff --git a/examples/sre/v3/local/prepare_cts_superset.sh b/examples/sre/v3/local/prepare_cts_superset.sh new file mode 100755 index 00000000..b93f8387 --- /dev/null +++ b/examples/sre/v3/local/prepare_cts_superset.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -o pipefail + +export LC_ALL=C + + +data_cts=data/cts/ +cts_superset_dir="" +wav_dir=wav/cts/ + +. tools/parse_options.sh || exit 1 + +echo $cts_superset_dir + +if [ ! -f $cts_superset_dir/docs/cts_superset_segment_key.tsv ];then + echo "ERROR: $cts_superset_dir/docs/cts_superset_segment_key.tsv does not exist." 
+ exit 1 +fi + +mkdir -p $data_cts + + +echo -n "" > ${data_cts}/wav.scp +for x in $(tail -n +2 $cts_superset_dir/docs/cts_superset_segment_key.tsv | cut -f 1 | sed "s:\.sph::" );do + echo "$x ffmpeg -nostdin -i ${cts_superset_dir}/data/${x}.sph -ar 8000 -f wav pipe:1 |" >> $data_cts/wav.scp +done + + +tail -n +2 $cts_superset_dir/docs/cts_superset_segment_key.tsv | cut -f 1,3 --output-delimiter=" " | sed "s:\.sph::" | sort > ${data_cts}/utt2spk + +tools/utt2spk_to_spk2utt.pl ${data_cts}/utt2spk > ${data_cts}/spk2utt diff --git a/examples/sre/v3/local/prepare_data.sh b/examples/sre/v3/local/prepare_data.sh new file mode 100755 index 00000000..6e5c7879 --- /dev/null +++ b/examples/sre/v3/local/prepare_data.sh @@ -0,0 +1,307 @@ +#!/bin/bash + +# Copyright (c) 2023 Zhengyang Chen (chenzhengyang117@gmail.com) +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +stage=-1 +stop_stage=-1 +#sre_data_dir= +data=data + +### +sre16_unlab_dir="" +sre16_evalset_dir="" +sre16_evalset_keys="" +### +sre18_devset_dir="" +sre18_evalset_dir="" +sre18_evalset_keys="" +### +sre21_devset_dir="" +sre21_evalset_dir="" +sre21_evalset_keys="" +### +cts_superset_dir="" +### +voxceleb_dir="" + +compute_total_utterance_duration=true # Whether to compute the total utterance duration, i.e., including no speech parts + # Can be used as an addition filtering requirement. Currently only supported for + # VoxCeleb. +compute_vad_for_voxceleb=true +include_voxceleb_vad_in_train_data=true # If false, only CTS vad will be inluded which means that VAD will not be applied for VoxCeleb during training. + +. tools/parse_options.sh || exit 1 + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + mkdir -p external_tools + # Download voice activity detection model pretrained by Silero Team + wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v4.0.zip -O external_tools/silero-vad-v4.0.zip + unzip -o external_tools/silero-vad-v4.0.zip -d external_tools +fi + + +### SRE16 +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # We use the scripts from the Kaldi SRE16 recipe with some minor modifications. + + # Prepare NIST SRE 2016 evaluation data. + local/make_sre16_eval.pl $sre16_evalset_dir $sre16_evalset_keys data + + # Prepare unlabeled Cantonese and Tagalog development data. This dataset + # was distributed to SRE participants. 
+ local/make_sre16_unlabeled.pl $sre16_unlab_dir data +fi + + +### SRE18 +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "Preparing SRE18" + local/prepare_sre18.sh --stage 1 --stop_stage 1 --sre18_dev_dir $sre18_devset_dir --sre18_eval_dir $sre18_evalset_dir --sre18_eval_keys_file $sre18_evalset_keys --data_dir $data/sre18 +fi + + +### SRE21 +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "Preparing SRE21" + local/prepare_sre21.sh --stage 1 --stop_stage 1 --sre21_dev_dir $sre21_devset_dir --sre21_eval_dir $sre21_evalset_dir --sre21_eval_keys_file $sre21_evalset_keys --data_dir $data/sre21 +fi + + +### CTS +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + echo "Preparing CTS" + local/prepare_cts_superset.sh --cts_superset_dir $cts_superset_dir --data_cts $data/cts --wav_dir `pwd`/wav/cts + + + # Only mixer data. Used for backend training. Create only lists here. + # The data directory will be created later, after VAD. + awk -F"\t" '{if($7 == "mx3" || $7 == "mx45" || $7 == "mx6"){print $0} }' ${cts_superset_dir}/docs/cts_superset_segment_key.tsv \ + > data/cts_superset_segment_key_mx3456.tsv + cut -f 1 data/cts_superset_segment_key_mx3456.tsv | sed s:\\.sph$:: > data/mx_3456.list + +fi + + +### VoxCeleb +# We are using all of VoxCeleb 1 and the training (aka "development") part of VoxCeleb 2. +# (The test part of VoxCeleb 2) may have some overlap with VoxCeleb 1. See +# https://www.robots.ox.ac.uk/~vgg/publications/2019/Nagrani19/nagrani19.pdf, Table 4.) +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + + echo "Preparing VoxCeleb" + if [[ $voxceleb_dir == "" ]];then + echo "Preparing Voxceleb, rirs and Musan" + voxceleb_dir=${data}_vox + mkdir ${voxceleb_dir} + local/prepare_vox.sh --stage 1 --stop_stage 4 --data ${data}_vox + fi + + if [[ ! -d $voxceleb_dir/vox1 || ! -d $voxceleb_dir/vox2_dev ]];then + echo "ERROR: Problem with Voxceleb data directory." + exit 1 + fi + + # Downsample VoxCeleb and apply GSM. We create a new wav.scp with this command in the + # extraction chain rather than creating the new wav files explicitly. + sox_command='-t gsm -r 8000 - | sox -t gsm -r 8000 - -t wav -r 8000 -c 1 -e signed-integer -' + for dset in vox1 vox2_dev;do + tools/copy_data_dir.sh $voxceleb_dir/$dset $data/${dset}_gsmfr + awk -v sc="$sox_command" '{print $1 " sox " $2 " " sc " |" }' $voxceleb_dir/$dset/wav.scp > $data/${dset}_gsmfr/wav.scp + done + + # Combine all Voxceleb data + tools/combine_data.sh data/vox_gsmfr data/vox1_gsmfr/ data/vox2_dev_gsmfr/ + + # Copy rirs and musan from voxceleb. We don't need to downsample as this will be + # done on-the-fly. If the direcotires already contain the data in lmdb format + # we just link it. Otherwise we copy it and let later stages create the lmdb + # format data here. Since we don't want to affect the original data. + for x in rirs musan;do + if [ -d $voxceleb_dir/$x/lmdb ];then + ln -s $voxceleb_dir/$x $data/ + else + mkdir $data/$x + cp -r $voxceleb_dir/$x/wav.scp $data/$x/wav.scp + fi + done + +fi + + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + + echo "Get vad segmentation for dataset." 
+ true && { + # Set VAD min duration + min_duration=0.25 + for dset in vox_gsmfr cts sre18/dev/test sre18/dev/enrollment sre18/dev/unlabeled sre18/eval/test sre18/eval/enrollment sre21/dev/test sre21/dev/enrollment sre21/eval/test sre21/eval/enrollment sre16_major sre16/eval/enrollment sre16/eval/test; do + python3 local/make_system_sad.py \ + --repo-path external_tools/silero-vad-4.0 \ + --scp ${data}/${dset}/wav.scp \ + --min-duration $min_duration > ${data}/${dset}/vad + cp -r ${data}/${dset} ${data}/${dset}-bk # Since VAD is quite time-consuming, it is good to have a backup. + done + } + + true && { + # We may consider to use only the mixer portion of the CTS data for backen training + # as it may be closer to the SRE data. + + tools/subset_data_dir.sh --utt-list data/mx_3456.list data/cts data/mx_3456 + tools/filter_scp.pl -f 2 ${data}/mx_3456/wav.scp ${data}/cts/vad > ${data}/mx_3456/vad + + + # For PLDA training, it is better to augment the training data + python3 local/generate_sre_aug.py --ori_dir ${data}/mx_3456 \ + --aug_dir ${data}/mx_3456_aug \ + --aug_copy_num 2 + + tools/utt2spk_to_spk2utt.pl ${data}/mx_3456_aug/utt2spk > ${data}/mx_3456_aug/spk2utt + } + + true && { + # We may consider to use only the mixer portion of the CTS data for backend training + # as it may be closer to the SRE data. + + # For PLDA training, it is better to augment the training data + python3 local/generate_sre_aug.py --ori_dir ${data}/cts \ + --aug_dir ${data}/cts_aug \ + --aug_copy_num 2 + + tools/utt2spk_to_spk2utt.pl ${data}/cts_aug/utt2spk > ${data}/cts_aug/spk2utt + } + +fi + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + + true && { + for dset in cts vox_gsmfr; do + echo $dset + if [ -f ${data}/${dset}/vad ] && ( [ $dset != "vox_gsmfr" ] || $compute_vad_for_voxceleb ) ;then + echo "Using VAD info" + python3 local/utt2voice_duration.py \ + --vad_file ${data}/${dset}/vad \ + --utt2voice_dur ${data}/${dset}/utt2voice_dur + cp ${data}/${dset}/utt2voice_dur ${data}/${dset}-bk/ # Good to have backup also of this + fi + done + } + + true && { + # The below need to be improved to work for a general wav.scp. It only works for the specif format of voxceleb wav.scp + # at the moment. + for dset in vox_gsmfr; do + if $compute_total_utterance_duration; then + # We may, for example, avoid applying VAD on VoxCeleb in which case we need this. + # Note that the durations are estimated on the original wave file, before sox + # downsampling and GSM codec is applied. + echo "Using soxi" + + cut -f3 -d" " ${data}/${dset}/wav.scp | awk '{ print "soxi -D " $0 }' > ${data}/${dset}/soxi_cmd.sh + split -a 4 -d -n l/12 ${data}/${dset}/soxi_cmd.sh ${data}/${dset}/soxi_cmd.split. + for i in {0000..11}; do + bash ${data}/${dset}/soxi_cmd.split.$i > ${data}/${dset}/soxi_cmd.split.$i.out & + done + wait + + for i in {0000..11}; do cat ${data}/${dset}/soxi_cmd.split.$i.out; done > ${data}/${dset}/dur_tmp + cut -f1 -d" " ${data}/${dset}/wav.scp > ${data}/${dset}/utt_tmp + paste -d " " ${data}/${dset}/utt_tmp ${data}/${dset}/dur_tmp > ${data}/${dset}/utt2dur + + rm ${data}/${dset}/soxi_cmd.* ${data}/${dset}/vox_gsmfr/dur_tmp ${data}/${dset}/utt_tmp + + cp ${data}/${dset}/utt2dur ${data}/${dset}-bk/ # Good to have backup also of this + fi + done + } +fi + +if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then + + declare -A voice_dur_threshold=( ["cts"]=5.0 ["vox_gsmfr"]=0.0 ) # Note that a threshold of 0.0 still means that utterances with no speech + # according to VAD will be discarded at this stage. 
So if we want to keep + # them, we should skip block 1 for the set instead. + declare -A dur_threshold=( ["cts"]=0.0 ["vox_gsmfr"]=5.0 ) + + declare -A uttPerSpk_threshold=( ["cts"]=2 ["vox_gsmfr"]=2 ) # Kept if more than this threshold. (I.e. equality not sufficient.) + + true && { + # Following the Kaldi recipe: https://github.com/kaldi-asr/kaldi/blob/71f38e62cad01c3078555bfe78d0f3a527422d75/egs/sre16/v2/run.sh#L189 + # We filter out the utterances with duration less than 5s + echo "Stage 9, block 1" + echo "Applying filtering based on voice duration " + #for dset in cts vox_gsmfr; do + for dset in cts; do + n_utt_before=$( wc -l ${data}/${dset}/utt2spk | cut -f1 -d " " ) + n_spk_before=$( wc -l ${data}/${dset}/spk2utt | cut -f1 -d " " ) + python3 local/filter_utt_accd_dur.py \ + --wav_scp ${data}/${dset}/wav.scp \ + --utt2voice_dur ${data}/${dset}/utt2voice_dur \ + --filter_wav_scp ${data}/${dset}/filter_wav.scp \ + --dur_thres ${voice_dur_threshold[$dset]} + mv ${data}/${dset}/wav.scp ${data}/${dset}/wav.scp.bak + mv ${data}/${dset}/filter_wav.scp ${data}/${dset}/wav.scp + tools/fix_data_dir.sh ${data}/${dset} + echo " $dset " + echo " #utt / #spk before: $n_utt_before / $n_spk_before " + n_utt_after=$( wc -l ${data}/${dset}/utt2spk | cut -f1 -d " " ) + n_spk_after=$( wc -l ${data}/${dset}/spk2utt | cut -f1 -d " " ) + echo " #utt / #spk after: $n_utt_after / $n_spk_after " + done + } + echo "Stage 9, block 2" + echo "Applying filtering based on the whole utterance duration (including non-speech parts) " + #for dset in cts vox_gsmfr; do + for dset in vox_gsmfr; do + n_utt_before=$( wc -l ${data}/${dset}/utt2spk | cut -f1 -d " " ) + n_spk_before=$( wc -l ${data}/${dset}/spk2utt | cut -f1 -d " " ) + python3 local/filter_utt_accd_dur.py \ + --wav_scp ${data}/${dset}/wav.scp \ + --utt2voice_dur ${data}/${dset}/utt2dur \ + --filter_wav_scp ${data}/${dset}/filter_wav.scp \ + --dur_thres ${dur_threshold[$dset]} + mv ${data}/${dset}/wav.scp ${data}/${dset}/wav.scp.bak + mv ${data}/${dset}/filter_wav.scp ${data}/${dset}/wav.scp + tools/fix_data_dir.sh ${data}/${dset} + echo " $dset " + echo " #utt / #spk before: $n_utt_before / $n_spk_before " + n_utt_after=$( wc -l ${data}/${dset}/utt2spk | cut -f1 -d " " ) + n_spk_after=$( wc -l ${data}/${dset}/spk2utt | cut -f1 -d " " ) + echo " #utt / #spk after: $n_utt_after / $n_spk_after " + done + + + # Similarly, following the Kaldi recipe, + # we throw out speakers with fewer than 3 utterances. + echo "Stage 9, block 3" + for dset in cts vox_gsmfr; do + #tools/fix_data_dir.sh ${data}/${dset} + cp ${data}/${dset}/spk2utt ${data}/${dset}/spk2utt.bak + awk -v thr=${uttPerSpk_threshold[$dset]} '{if(NF>thr){print $0}}' ${data}/${dset}/spk2utt.bak > ${data}/${dset}/spk2utt + tools/spk2utt_to_utt2spk.pl ${data}/${dset}/spk2utt > ${data}/${dset}/utt2spk + tools/fix_data_dir.sh ${data}/${dset} + done + + ./tools/combine_data.sh data/cts_vox data/cts/ data/vox_gsmfr + if $include_voxceleb_vad_in_train_data;then + cat data/cts/vad data/vox_gsmfr/vad > data/cts_vox/vad + else + cat data/cts/vad > data/cts_vox/vad + fi +fi + + diff --git a/examples/sre/v3/local/prepare_sre18.sh b/examples/sre/v3/local/prepare_sre18.sh new file mode 100755 index 00000000..2952b0e3 --- /dev/null +++ b/examples/sre/v3/local/prepare_sre18.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright (c) 2023 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +set -o pipefail + +export LC_ALL=C + + +sre18_dev_dir="" +sre18_eval_dir="" +sre18_eval_keys_file="" +data_dir=data/sre18 +wav_dir=wav/ +stage=1 +stop_stage=1 + +. tools/parse_options.sh || exit 1 + +echo "sre18 dev dir: $sre18_dev_dir" +echo "sre18 eval dir: $sre18_eval_dir" +echo "sre18 eval keys file: $sre18_eval_keys_file" + +declare -A set2dir=( ["dev"]=$sre18_dev_dir ["eval"]=$sre18_eval_dir ) +declare -A set2subset=( ["dev"]="enrollment test unlabeled" ["eval"]="enrollment test" ) + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + + + for z in dev eval;do + src_dir=${set2dir[$z]} + + echo "Processing SRE 18 $z set from $src_dir" + + + true && { + for s in ${set2subset[$z]};do + + tgt_dir=$data_dir/$z/$s + echo " - $s set. Storing in $tgt_dir" + mkdir -p $tgt_dir + + if [ -f $tgt_dir/wav.scp ];then + rm $tgt_dir/wav.scp + fi + + # Create the wav files + for x in $( ls $src_dir/data/${s}/ );do + name=$(basename $x .sph) + if [ $name != $x ];then + # suffix is .sph + echo "$name ffmpeg -nostdin -i $src_dir/data/${s}/$x -ar 8000 -f wav pipe:1 |" >> $tgt_dir/wav.scp + else + name=$(basename $x .flac) + if [ $name != $x ];then + # suffix is .flac + # From http://trac.ffmpeg.org/wiki/audio%20types:"The default for muxing + # into WAV files is pcm_s16le." so the below should be ok. + echo "$name ffmpeg -nostdin -i $src_dir/data/${s}/$x -ar 8000 -f wav pipe:1 |" >> $tgt_dir/wav.scp + else + echo "ERROR: Invalid suffix in file $x" + exit 1 + fi + fi + done + done + } + + # Mappings for "enrollment models" <-> "utterances" + # The evaluation consider enrollment "models" rather than enrollment "speakers". Possibly several models could be + # from the same speaker. There speaker ID of the models are not known. So we can't create "spk2utt" and utt2spk". + # For test data there is no such mappings either. + grep -v modelid $src_dir/docs/sre18_${z}_enrollment.tsv | cut -f1,2 | sed "s:\t: :" > $data_dir/$z/enrollment/enrollment.txt + cat $data_dir/$z/enrollment/enrollment.txt | sed "s:.sph$::" | sed "s:.flac$: :" \ + | awk '{print $2 " " $1}' > $data_dir/$z/enrollment/utt2mdl_id + # No utterance is used in more than one mdl so utt2mdl_id makes sense. + ./tools/utt2spk_to_spk2utt.pl $data_dir/$z/enrollment/utt2mdl_id > $data_dir/$z/enrollment/mdl_id2utt + + true && { + # Trial list and keys. Not available in the eval directory so the specified file is used. 
+ if [ $z == "eval" ];then + cp $sre18_eval_keys_file $data_dir/$z/ + key_name=$( basename $sre18_eval_keys_file .tbz2) + tar -xvf $data_dir/$z/${key_name}.tbz2 -C $data_dir/$z/ + key_file=$data_dir/$z/LDC2018E51/docs/sre18_eval_trial_key.tsv + else + key_file=$src_dir/docs/sre18_dev_trial_key.tsv + fi + } + + tail -n+2 $key_file | cut -f1,2,4 | sed "s:\.sph::" | sed "s:\.flac::" | sed "s:\t: :g" > $data_dir/$z/sre18_${z}_trials + + + done +fi + + diff --git a/examples/sre/v3/local/prepare_sre21.sh b/examples/sre/v3/local/prepare_sre21.sh new file mode 100755 index 00000000..80bacacc --- /dev/null +++ b/examples/sre/v3/local/prepare_sre21.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# Copyright (c) 2023 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +set -o pipefail + +export LC_ALL=C + + +sre21_dev_dir="" +sre21_eval_dir="" +sre21_eval_keys_file="" +data_dir=data/sre21 +wav_dir=wav/ #sre21_eval +stage=1 +stop_stage=1 + +. tools/parse_options.sh || exit 1 + +echo "sre21 dev dir: $sre21_dev_dir" +echo "sre21 eval dir: $sre21_eval_dir" +echo "sre21 eval keys file: $sre21_eval_keys_file" + +declare -A set2dir=( ["dev"]=$sre21_dev_dir ["eval"]=$sre21_eval_dir ) + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + + + for z in dev eval;do + src_dir=${set2dir[$z]} + + echo "Processing SRE 21 $z set from $src_dir" + + true && { + for s in enrollment test;do + + tgt_dir=$data_dir/$z/$s + echo " - $s set. Storing in $tgt_dir" + mkdir -p $tgt_dir + + if [ -f $tgt_dir/wav.scp ];then + rm $tgt_dir/wav.scp + fi + + # Create the wav files + for x in $( ls $src_dir/data/audio/${s}/ );do + name=$(basename $x .sph) + if [ $name != $x ];then + # suffix is .sph + echo "$name ffmpeg -nostdin -i $src_dir/data/audio/${s}/$x -ar 8000 -f wav pipe:1 |" >> $tgt_dir/wav.scp + else + name=$(basename $x .flac) + if [ $name != $x ];then + # suffix is .flac + # From http://trac.ffmpeg.org/wiki/audio%20types:"The default for muxing + # into WAV files is pcm_s16le." so the below should be ok. + echo "$name ffmpeg -nostdin -i $src_dir/data/audio/${s}/$x -ar 8000 -f wav pipe:1 |" >> $tgt_dir/wav.scp + else + echo "ERROR: Invalid suffix in file $x" + exit 1 + fi + fi + done + done + + + # Mappings for "enrollment models" <-> "utterances" + # The evaluation consider enrollment "models" rather than enrollment "speakers". Possibly several models could be + # from the same speaker. There speaker ID of the models are not known. So we can't create "spk2utt" and utt2spk". + # For test data there is no such mappings either. + grep -v modelid $src_dir/docs/sre21_audio_${z}_enrollment.tsv | sed "s:\t: :" > $data_dir/$z/enrollment/enrollment.txt + cat $data_dir/$z/enrollment/enrollment.txt | sed "s:.sph$::" | sed "s:.flac$: :" \ + | awk '{print $2 " " $1}' > $data_dir/$z/enrollment/utt2mdl_id + # No utterance is used in more than one mdl so utt2mdl_id makes sense. + ./tools/utt2spk_to_spk2utt.pl $data_dir/$z/enrollment/utt2mdl_id > $data_dir/$z/enrollment/mdl_id2utt + } + + # Trial list and keys. 
Not available in the eval directory so the specified file is used. + if [ $z == "eval" ];then + cp $sre21_eval_keys_file $data_dir/$z/ + key_name=$( basename $sre21_eval_keys_file ) + echo "tar -xvf $data_dir/$z/$key_name -C $data_dir/$z/" + tar -xvf $data_dir/$z/$key_name -C $data_dir/$z/ + key_file=$data_dir/$z/sre21/releases/LDC2021E10/docs/sre21_audio_eval_trial_key.tsv + else + key_file=$src_dir/docs/sre21_audio_dev_trial_key.tsv + fi + + + tail -n+2 $key_file | cut -f1,2,3 | sed "s:\.sph::" | sed "s:\.flac::" | sed "s:\t: :g" > $data_dir/$z/sre21_${z}_trials + + + done +fi + + diff --git a/examples/sre/v3/local/prepare_vox.sh b/examples/sre/v3/local/prepare_vox.sh new file mode 120000 index 00000000..a0b9251c --- /dev/null +++ b/examples/sre/v3/local/prepare_vox.sh @@ -0,0 +1 @@ +../../../voxceleb/v2/local/prepare_data.sh \ No newline at end of file diff --git a/examples/sre/v3/local/score.sh b/examples/sre/v3/local/score.sh new file mode 100755 index 00000000..1d84e00c --- /dev/null +++ b/examples/sre/v3/local/score.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# Copyright (c) 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn) +# 2023 Zhengyang Chen (chenhzhengyang117@gmail.com) +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#exp_dir= +#trials="trials trials_tgl trials_yue" +#data=data + + +trials="" +xvectors="" +cal_mean_dir="" +exp_dir="" + +stage=-1 +stop_stage=-1 + +. tools/parse_options.sh +. path.sh + +echo " - trials $trials" +echo " - xvectors $xvectors" +echo " - cal_mean dir $cal_mean_dir" +echo " - exp_dir $exp_dir" + + +scores_dir=${exp_dir}/scores + +echo $cal_mean_dir + +if [ -z $cal_mean_dir ];then + cal_mean_string="--cal_mean False --cal_mean_dir xxxx" # For the moment, score.py requires something to be input to --cal_mean_dir + output_name=$(basename $xvectors | sed "s:xvector::" | sed "s:.scp::" | sed "s:^_::") # Changes xvector_proc_embd_proc_sre16_major.scp -> proc_embd_proc_sre16_major +else # xvector.scp -> '' (empty string) + cal_mean_string="--cal_mean True --cal_mean_dir $cal_mean_dir" + output_name="mean_$(echo $cal_mean_dir | sed "s:.*embeddings/::" | sed -e "s:/:_:g")" # Name will be e.g. mean_sre16_major if sre16/major data is used +fi # for mean subtraction. + + +echo $cal_mean_string +echo $output_name + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "apply cosine scoring ..." + mkdir -p ${exp_dir}/scores + for x in $(echo $trials | tr "," " "); do + echo "Trials $x" + python wespeaker/bin/score.py \ + --exp_dir ${exp_dir} \ + --eval_scp_path $xvectors \ + $cal_mean_string \ + ${x} + xx=$(basename $x) + mv ${scores_dir}/${xx}.score ${scores_dir}/${xx}.${output_name}_cos.score + done +fi + + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "compute metrics (EER/minDCF) ..." 
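+    # EER and minDCF are computed from the cosine scores of each trial list. With
+    # --p_target 0.01 and unit costs for false alarms and misses this is the usual
+    # minDCF(0.01) operating point. The metrics are also written next to the score
+    # file, e.g. (for the sre16 "major" mean) trials.mean_sre16_major_cos.result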
+ for x in $(echo $trials | tr "," " "); do + xx=$(basename $x) + echo $xx + python wespeaker/bin/compute_metrics.py \ + --p_target 0.01 \ + --c_fa 1 \ + --c_miss 1 \ + ${scores_dir}/${xx}.${output_name}_cos.score \ + 2>&1 | tee ${scores_dir}/${xx}.${output_name}_cos.result + + echo "compute DET curve ..." + python wespeaker/bin/compute_det.py \ + ${scores_dir}/${xx}.${output_name}_cos.score + done +fi diff --git a/examples/sre/v3/local/score_plda.sh b/examples/sre/v3/local/score_plda.sh new file mode 100755 index 00000000..ac79a160 --- /dev/null +++ b/examples/sre/v3/local/score_plda.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# Copyright (c) 2023 Shuai Wang (wsstriving@gmail.com) +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +exp_dir="exp/ResNet34-TSTP-emb256-fbank40-num_frms200-aug0.6-spFalse-saFalse-Softmax-SGD-epoch10/" +data=data +trials="${data}/sre16/eval/trials ${data}/sre16/eval/trials_tgl ${data}/sre16/eval/trials_yue" +aug_plda_data=0 + +enroll_scp=sre16/eval/enrollment/xvector.scp +test_scp=sre16/eval/test/xvector.scp +utt2spk=data/sre16/eval/enrollment/utt2spk +preprocessing_chain='length-norm' +preprocessing_path="${exp_dir}/embd_proc.pkl" + +stage=-1 +stop_stage=-1 + +. tools/parse_options.sh +. path.sh + +if [ $aug_plda_data = 0 ];then + sre_plda_data=cts +else + sre_plda_data=cts_aug +fi + +echo "preprocessing_path $preprocessing_path" +preproc_name=$(basename $preprocessing_path .pkl) +echo "preproc_name $preproc_name" + + + +# Kaldi PLDA cts_aug, cts_aug mean, speaker mean last, no lnorm in PLDA +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "Preparing preprocessing chain for backend " + python wespeaker/bin/prep_embd_proc.py \ + --chain "$preprocessing_chain" \ + --path $preprocessing_path + echo "Backend preprocessor prepared" +fi + + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "Applying preprocessing on PLDA training data." + python wespeaker/bin/apply_embd_proc.py \ + --path $preprocessing_path \ + --input ${exp_dir}/embeddings/${sre_plda_data}/xvector.scp \ + --output ${exp_dir}/embeddings/${sre_plda_data}/xvector_proc_$preproc_name.ark,scp +fi + + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "train the plda model ..." + python wespeaker/bin/train_plda.py \ + --exp_dir ${exp_dir} \ + --scp_path ${exp_dir}/embeddings/${sre_plda_data}/xvector_proc_$preproc_name.scp \ + --utt2spk ${data}/${sre_plda_data}/utt2spk \ + --indim 100 \ + --iter 10 + echo "plda training finished" +fi + + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "Applying preprocessing on evaluation and adaptation data." 
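+    # $indomain_scp is not defined in this script (only score_plda_adapt.sh uses an
+    # adaptation list), so it expands to nothing here and the loop processes the
+    # enroll and test lists. Each xvector.scp is pushed through the chain and stored
+    # as xvector_proc_${preproc_name}.ark,scp, e.g.
+    #   sre16/eval/test/xvector.scp -> sre16/eval/test/xvector_proc_embd_proc_cts_aug.scp
+    # when run.sh passes embd_proc_cts_aug.pkl as --preprocessing_path.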
+ for x in $enroll_scp $test_scp $indomain_scp;do + #new_x=$(echo $x | sed "s:\.scp:_proc\.ark,scp:") + new_x=$(echo $x | sed "s:\.scp:_proc_$preproc_name\.ark,scp:") + echo "Processing in: $x" + echo "Processing out: $new_x" + python wespeaker/bin/apply_embd_proc.py \ + --path $preprocessing_path \ + --input ${exp_dir}/embeddings/$x \ + --output ${exp_dir}/embeddings/$new_x + done +fi + + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + echo "apply plda scoring ..." + mkdir -p ${exp_dir}/scores + + enroll_scp=$(echo $enroll_scp | sed "s:\.scp:_proc_$preproc_name\.scp:") + test_scp=$(echo $test_scp | sed "s:\.scp:_proc_$preproc_name\.scp:") + + for x in $(echo $trials | tr "," " "); do + xx=$(basename $x) + echo "scoring on " $x + python wespeaker/bin/eval_plda.py \ + --enroll_scp_path ${exp_dir}/embeddings/$enroll_scp \ + --test_scp_path ${exp_dir}/embeddings/$test_scp \ + --utt2spk $utt2spk \ + --trial ${x} \ + --score_path ${exp_dir}/scores/${xx}.proc_${preproc_name}_plda.score \ + --model_path ${exp_dir}/plda + done +fi + + +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + echo "compute metrics (EER/minDCF) ..." + scores_dir=${exp_dir}/scores + for x in $(echo $trials | tr "," " "); do + xx=$(basename $x) + python wespeaker/bin/compute_metrics.py \ + --p_target 0.01 \ + --c_fa 1 \ + --c_miss 1 \ + ${scores_dir}/${xx}.proc_${preproc_name}_plda.score \ + 2>&1 | tee ${scores_dir}/${xx}.proc_${preproc_name}_plda.result + # 2>&1 | tee -a ${scores_dir}/${xx}_plda_result + + echo "compute DET curve ..." + python wespeaker/bin/compute_det.py \ + ${scores_dir}/${xx}.proc_${preproc_name}_plda.score + done +fi diff --git a/examples/sre/v3/local/score_plda_adapt.sh b/examples/sre/v3/local/score_plda_adapt.sh new file mode 100755 index 00000000..533b5484 --- /dev/null +++ b/examples/sre/v3/local/score_plda_adapt.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +# Copyright (c) 2023 Shuai Wang (wsstriving@gmail.com) +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +exp_dir=exp/ResNet34-TSTP-emb256-fbank40-num_frms200-aug0.6-spFalse-saFalse-Softmax-SGD-epoch10/ +data=data +trials="${data}/sre16/eval/trials ${data}/sre16/eval/trials_tgl ${data}/sre16/eval/trials_yue" +aug_plda_data=0 + +enroll_scp=sre16/eval/enrollment/xvector.scp +test_scp=sre16/eval/test/xvector.scp +indomain_scp=sre16/major/xvector.scp # For adaptation +utt2spk=data/sre16/eval/enrollment/utt2spk +preprocessing_path=${exp_dir}/embd_proc_sre16_major.pkl + +stage=-1 +stop_stage=-1 + +. tools/parse_options.sh +. path.sh + +if [ $aug_plda_data = 0 ];then + sre_plda_data=sre +else + sre_plda_data=sre_aug +fi + +preproc_name=$(basename $preprocessing_path .pkl) + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "Applying preprocessing on evaluation and adaptation data." 
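+    # Unlike score_plda.sh, the adaptation list ($indomain_scp, by default
+    # sre16/major/xvector.scp) is preprocessed here as well, because the PLDA
+    # adaptation in the next stage runs on embeddings that have passed through the
+    # same chain. With the default preprocessing_path the outputs are e.g.
+    #   sre16/major/xvector.scp -> sre16/major/xvector_proc_embd_proc_sre16_major.scp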
+ for x in $enroll_scp $test_scp $indomain_scp;do + #new_x=$(echo $x | sed "s:\.scp:_proc\.ark,scp:") + new_x=$(echo $x | sed "s:\.scp:_proc_$preproc_name\.ark,scp:") + echo "Processing in: $x" + echo "Processing out: $new_x" + python wespeaker/bin/apply_embd_proc.py \ + --path $preprocessing_path \ + --input ${exp_dir}/embeddings/$x \ + --output ${exp_dir}/embeddings/$new_x + done +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "adapt the plda model ..." + + indomain_scp=$(echo $indomain_scp | sed "s:\.scp:_proc_$preproc_name\.scp:") + + python wespeaker/bin/adapt_plda.py \ + -mo ${exp_dir}/plda \ + -ma ${exp_dir}/plda_adapt \ + -ad ${exp_dir}/embeddings/$indomain_scp \ + -ws 0.75 \ + -as 0.25 + echo "plda adapted finished" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "apply plda scoring ..." + + enroll_scp=$(echo $enroll_scp | sed "s:\.scp:_proc_$preproc_name\.scp:") + test_scp=$(echo $test_scp | sed "s:\.scp:_proc_$preproc_name\.scp:") + + mkdir -p ${exp_dir}/scores + for x in $(echo $trials | tr "," " "); do + xx=$(basename $x) + echo "scoring on " $x + python wespeaker/bin/eval_plda.py \ + --enroll_scp_path ${exp_dir}/embeddings/$enroll_scp \ + --test_scp_path ${exp_dir}/embeddings/$test_scp \ + --utt2spk $utt2spk \ + --trial ${x} \ + --score_path ${exp_dir}/scores/${xx}.proc_${preproc_name}_plda_adapt.score \ + --model_path ${exp_dir}/plda_adapt + done +fi +#--indomain_scp ${exp_dir}/embeddings/$indomain_scp \ Note: This option was used before the new code for preprocessing. +# With this code, all preprocessing takes place in the preprocessing chain. So we don't include it in the above code anymore. + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "compute metrics (EER/minDCF) ..." + scores_dir=${exp_dir}/scores + for x in $(echo $trials | tr "," " "); do + xx=$(basename $x) + python wespeaker/bin/compute_metrics.py \ + --p_target 0.01 \ + --c_fa 1 \ + --c_miss 1 \ + ${scores_dir}/${xx}.proc_${preproc_name}_plda_adapt.score \ + 2>&1 | tee ${scores_dir}/${xx}.proc_${preproc_name}_plda_adapt.result + #2>&1 | tee -a ${scores_dir}/${xx}_plda_adapt_result + + echo "compute DET curve ..." + python wespeaker/bin/compute_det.py \ + ${scores_dir}/${xx}.proc_${preproc_name}_plda_adapt.score + done +fi + + + diff --git a/examples/sre/v3/local/score_plda_org.sh b/examples/sre/v3/local/score_plda_org.sh new file mode 100755 index 00000000..a601ad5f --- /dev/null +++ b/examples/sre/v3/local/score_plda_org.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# Copyright (c) 2023 Shuai Wang (wsstriving@gmail.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +exp_dir= +data=data +trials="${data}/sre16/eval/trials ${data}/sre16/eval/trials_tgl ${data}/sre16/eval/trials_yue" +aug_plda_data=0 + +enroll_scp=sre16/eval/enrollment/xvector.scp +test_scp=sre16/eval/test/xvector.scp +indomain_scp=sre16/major/xvector.scp # For mean subtraction +utt2spk=data/sre16/eval/enrollment/utt2spk + +stage=-1 +stop_stage=-1 + +. tools/parse_options.sh +. 
path.sh + +if [ $aug_plda_data = 0 ];then + sre_plda_data=cts +else + sre_plda_data=cts_aug +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "train the plda model ..." + python wespeaker/bin/train_plda.py \ + --exp_dir ${exp_dir} \ + --scp_path ${exp_dir}/embeddings/${sre_plda_data}/xvector.scp \ + --utt2spk ${data}/${sre_plda_data}/utt2spk \ + --indim 256 \ + --iter 200 + echo "plda training finished" +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "apply plda scoring ..." + mkdir -p ${exp_dir}/scores + for x in $(echo $trials | tr "," " "); do + xx=$(basename $x) + echo "scoring on " $x + python wespeaker/bin/eval_plda.py \ + --enroll_scp_path ${exp_dir}/embeddings/$enroll_scp \ + --test_scp_path ${exp_dir}/embeddings/$test_scp \ + --indomain_scp ${exp_dir}/embeddings/$indomain_scp \ + --utt2spk $utt2spk \ + --trial ${x} \ + --score_path ${exp_dir}/scores/${xx}.pldascore \ + --model_path ${exp_dir}/plda + done +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "compute metrics (EER/minDCF) ..." + scores_dir=${exp_dir}/scores + for x in $(echo $trials | tr "," " "); do + xx=$(basename $x) + python wespeaker/bin/compute_metrics.py \ + --p_target 0.01 \ + --c_fa 1 \ + --c_miss 1 \ + ${scores_dir}/${xx}.pldascore \ + 2>&1 | tee -a ${scores_dir}/${xx}_plda_result + + echo "compute DET curve ..." + python wespeaker/bin/compute_det.py \ + ${scores_dir}/${xx}.pldascore + done +fi diff --git a/examples/sre/v3/local/utt2voice_duration.py b/examples/sre/v3/local/utt2voice_duration.py new file mode 120000 index 00000000..65fe6b90 --- /dev/null +++ b/examples/sre/v3/local/utt2voice_duration.py @@ -0,0 +1 @@ +../../../sre/v2/local/utt2voice_duration.py \ No newline at end of file diff --git a/examples/sre/v3/path.sh b/examples/sre/v3/path.sh new file mode 100755 index 00000000..b90a5154 --- /dev/null +++ b/examples/sre/v3/path.sh @@ -0,0 +1,5 @@ +export PATH=$PWD:$PATH + +# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=../../../:$PYTHONPATH diff --git a/examples/sre/v3/qsub_extract.sh b/examples/sre/v3/qsub_extract.sh new file mode 100644 index 00000000..e3073c23 --- /dev/null +++ b/examples/sre/v3/qsub_extract.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# +#$ -cwd +#$ -V +#$ -N extract_embd +#$ -o extract_embd.out +#$ -e extract_embd.err +#$ -l gpu=4,ram_free=10G,mem_free=10G,core=2,matylda6=2,scratch=0.5,gpu_ram=16G +#$ -q long.q@@gpu + +cd /mnt/matylda6/rohdin/expts/wespeaker/wespeaker_private_test2/examples/sre/v3 +unset PYTHONPATH +unset PYTHONHOME + +# >>> conda initialize >>> +# !! Contents within this block are managed by 'conda init' !! +__conda_setup="$('/mnt/matylda5/iplchot/python_public/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" +if [ $? -eq 0 ]; then + eval "$__conda_setup" +else + if [ -f "/mnt/matylda5/iplchot/python_public/anaconda3/etc/profile.d/conda.sh" ]; then + . 
"/mnt/matylda5/iplchot/python_public/anaconda3/etc/profile.d/conda.sh" + else + export PATH="$PATH:/mnt/matylda5/iplchot/python_public/anaconda3/bin" + fi +fi +unset __conda_setup +# <<< conda initialize <<< + +conda activate /mnt/matylda6/rohdin/conda/wespeaker_20240220/ +which python +export PATH=$PATH:/mnt/matylda6/rohdin/software/kaldi_20210625/tools/sph2pipe/ + +./run.sh > logs/run.sh.stage4.log.1 2>&1 + + diff --git a/examples/sre/v3/qsub_train.sh b/examples/sre/v3/qsub_train.sh new file mode 100644 index 00000000..d6a6815d --- /dev/null +++ b/examples/sre/v3/qsub_train.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# +#$ -cwd +#$ -V +#$ -N train_xvec +#$ -o train_xvec.out +#$ -e train_xvec.err +#$ -pe smp 16 +#$ -l gpu=0.125,ram_free=1.25G,mem_free=1.25G,matylda6=0.625,gpu_ram=16G +#$ -q long.q@@gpu +cd /mnt/matylda6/rohdin/expts/wespeaker/wespeaker_private_test2/examples/sre/v3/ # Need to change your training directory. + +unset PYTHONPATH +unset PYTHONHOME + +# >>> conda initialize >>> +# !! Contents within this block are managed by 'conda init' !! +__conda_setup="$('/mnt/matylda5/iplchot/python_public/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" +if [ $? -eq 0 ]; then + eval "$__conda_setup" +else + if [ -f "/mnt/matylda5/iplchot/python_public/anaconda3/etc/profile.d/conda.sh" ]; then + . "/mnt/matylda5/iplchot/python_public/anaconda3/etc/profile.d/conda.sh" + else + export PATH="$PATH:/mnt/matylda5/iplchot/python_public/anaconda3/bin" + fi +fi +unset __conda_setup +# <<< conda initialize <<< + +conda activate /mnt/matylda6/rohdin/conda/wespeaker_20240220/ +./run.sh > logs/run.sh.stage3.log.1 2>&1 + + diff --git a/examples/sre/v3/run.sh b/examples/sre/v3/run.sh new file mode 100755 index 00000000..ccb39d21 --- /dev/null +++ b/examples/sre/v3/run.sh @@ -0,0 +1,538 @@ +#!/bin/bash + +# Copyright 2022 Hongji Wang (jijijiang77@gmail.com) +# 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn) +# 2023 Zhengyang Chen (chenzhengyang117@gmail.com) +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) + +. ./path.sh || exit 1 + +# Stages +# 1. Data preparation +# 2. Shard / raw list creation +# 3. Training +# 4. Model averaging, embedding extraction +# 5. Export model +# 6. Cosine scoring using cts_aug, sre16_major, sre18_dev_unlabeled for mean subtraction but no other embedding processing +# 7. PLDA scoring, including length-norm, lda and subtraction of the above mentioned sets. See details at the stage. +# 8. Adapted PLDA scoring. Same embedding processing as above. +# 9. Cosine scoring with same embedding processing as above. +# 10. Summarization of results. + +stage=1 +stop_stage=1 + +data=data +data_type="shard" # shard/raw + +# whether augment the PLDA data +aug_plda_data=1 + +config=conf/resnet.yaml +exp_dir=exp/ResNet34-TSTP-emb256-fbank64-num_frms200-aug0.6-spFalse-saFalse-Softmax-SGD-epoch10 + +# gpus="[0,1]" # For slurm, just specify this according to the number of GPUs you have. +num_gpus_train=2 # If this variable is defined, safe_gpu will be used to select the free GPUs. + # If so, it will override whatever may have been specified in gpus="[x,...] + # Typically, you would want to use this option for SGE. + # If this variable is not set, or set to '', the script will assume that + # the GPUs to use are specified in the variable "gpus" as above. + +num_gpus_extract=4 # We may want to use a different value for extraction. + +num_avg=10 +checkpoint= + + +. 
tools/parse_options.sh || exit 1 + +############################################################################################ +# The names of various lists are not consistent across sets. Therefore we need some mappings. + +# Different sets may use different backend adaptation sets, therefore we need several trial +# lists. Using "," instead of space as separator is a bit ugly but it seems parse_options.sh +# cannot process an argument with space properly. +declare -A trials=( ["sre16_eval"]='data/sre16/eval/trials,data/sre16/eval/trials_yue,data/sre16/eval/trials_tgl' + ["sre18_dev"]="data/sre18/dev/sre18_dev_trials" + ["sre18_eval"]="data/sre18/eval/sre18_eval_trials" + ["sre21_dev"]="data/sre21/dev/sre21_dev_trials" + ["sre21_eval"]="data/sre21/eval/sre21_eval_trials" ) + +declare -A enr_scp=( ["sre16_eval"]='sre16/eval/enrollment/xvector.scp' + ["sre18_dev"]="sre18/dev/enrollment/xvector.scp" + ["sre18_eval"]="sre18/eval/enrollment/xvector.scp" + ["sre21_dev"]="sre21/dev/enrollment/xvector.scp" + ["sre21_eval"]="sre21/eval/enrollment/xvector.scp" ) + +declare -A test_scp=( ["sre16_eval"]='sre16/eval/test/xvector.scp' + ["sre18_dev"]="sre18/dev/test/xvector.scp" + ["sre18_eval"]="sre18/eval/test/xvector.scp" + ["sre21_dev"]="sre21/dev/test/xvector.scp" + ["sre21_eval"]="sre21/eval/test/xvector.scp" ) + +declare -A utt2mdl=( ["sre16_eval"]='data/sre16/eval/enrollment/utt2spk' + ["sre18_dev"]="data/sre18/dev/enrollment/utt2mdl_id" + ["sre18_eval"]="data/sre18/eval/enrollment/utt2mdl_id" + ["sre21_dev"]="data/sre21/dev/enrollment/utt2mdl_id" + ["sre21_eval"]="data/sre21/eval/enrollment/utt2mdl_id" ) + +declare -A mdl2utt=( ["sre16_eval"]='data/sre16/eval/enrollment/spk2utt' + ["sre18_dev"]="data/sre18/dev/enrollment/mdl_id2utt" + ["sre18_eval"]="data/sre18/eval/enrollment/mdl_id2utt" + ["sre21_dev"]="data/sre21/dev/enrollment/mdl_id2utt" + ["sre21_eval"]="data/sre21/eval/enrollment/mdl_id2utt" ) + +declare -A xvectors=( ["sre16_eval"]="sre16/eval/xvector.scp" + ["sre18_dev"]="sre18/dev/xvector.scp" + ["sre18_eval"]="sre18/eval/xvector.scp" + ["sre21_dev"]="sre21/dev/xvector.scp" + ["sre21_eval"]="sre21/eval/xvector.scp" ) +############################################################################################ + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "Prepare datasets ..." + + + ###################################################################################### + ### Test sets. Please specify paths + # SRE16 should be prepared by the Kaldi recipe and the path should be specified here: + #sre_data_dir=/mnt/matylda4/burget/kaldi-trunk/kaldi/egs/sre16/v2/data/ + # Will be used by ./local/prepare_data.sh below. (only wav.scp, utt2spk and spk2utt files are needed.) + sre16_unlab_dir=/mnt/matylda2/data/NIST/sre16/LDC2016E46_SRE16_Call_My_Net_Training_Data + sre16_evalset_dir=/mnt/matylda2/data/NIST/sre16/R149_0_1 + # Eval keys are not in the above directory since they were distributed after the evaluation. + sre16_evalset_keys=/mnt/matylda2/data/NIST/sre16/download/sre16_evaluation_key.tar.bz2 + + # SRE18 + sre18_devset_dir=/mnt/matylda2/data/NIST/sre18/LDC2018E46_2018_NIST_Speaker_Recognition_Evaluation_Development_Set + sre18_evalset_dir=/mnt/matylda2/data/LDC/LDC2018E51_2018_NIST_Speaker_Recognition_Evaluation_Test_Set/ + # Eval keys are not in the above directory since they were distributed after the evaluation. 
+ sre18_evalset_keys=/mnt/matylda2/data/NIST/sre18/LDC2018E51_eval_segment_key.tbz2 + + # SRE21 + sre21_devset_dir=/mnt/matylda2/data/LDC/LDC2021E09_sre21_dev_set/ + sre21_evalset_dir=/mnt/matylda2/data/LDC/LDC2021E10_sre21_eval_set/ + # Eval keys are not in the above directory since they were distributed after the evaluation. + sre21_evalset_keys=/mnt/matylda2/data/NIST/sre21/download/sre21_test_key.tgz + + + ###################################################################################### + ### Training sets + # CTS + cts_superset_dir=/mnt/matylda2/data/LDC/LDC2021E08_SRE-CTS-Superset/ + + # VoxCeleb + voxceleb_dir="/mnt/matylda6/rohdin/expts/wespeaker/wespeaker/examples/voxceleb/v2/data/" + + # This script is based on ../v2/local/prepare_data.sh + # Copies SRE16 relevant files, extracts VAD for all files, does some pruning of the training set. + ./local/prepare_data.sh --stage 1 --stop_stage 10 --data ${data} \ + --sre16_unlab_dir ${sre16_unlab_dir} --sre16_evalset_dir ${sre16_evalset_dir} --sre16_evalset_keys ${sre16_evalset_keys} \ + --sre18_devset_dir ${sre18_devset_dir} --sre18_evalset_dir ${sre18_evalset_dir} --sre18_evalset_keys ${sre18_evalset_keys} \ + --sre21_devset_dir ${sre21_devset_dir} --sre21_evalset_dir ${sre21_evalset_dir} --sre21_evalset_keys ${sre21_evalset_keys} \ + --cts_superset_dir ${cts_superset_dir} --voxceleb_dir ${voxceleb_dir} +fi + + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + + true && { + echo "Convert train data to ${data_type}..." + for dset in cts_vox; do + python tools/make_shard_list.py --num_utts_per_shard 1000 \ + --num_threads 12 \ + --prefix shards \ + --shuffle \ + --vad_file ${data}/$dset/vad \ + ${data}/$dset/wav.scp ${data}/$dset/utt2spk \ + ${data}/$dset/shards ${data}/$dset/shard.list + done + } + + true && { + echo "Convert data for PLDA backend training and evaluation to raw format..." + if [ $aug_plda_data = 0 ];then + sre_plda_data=cts + else + sre_plda_data=cts_aug + fi + + # Raw format for backend and evaluation data + for dset in ${sre_plda_data} sre16/major sre16/eval/enrollment sre16/eval/test \ + sre18/dev/enrollment sre18/dev/test sre18/dev/unlabeled sre18/eval/enrollment sre18/eval/test \ + sre21/dev/enrollment sre21/dev/test sre21/eval/enrollment sre21/eval/test;do + + # The below requires utt2spk to be present. So create a "dummy" one if we don't have it. + # This is for example the case with sre21 eval data. + if [ ! -f $data/$dset/utt2spk ];then + awk '{print $1 " unk"}' ${data}/${dset}/wav.scp > ${data}/${dset}/utt2spk + fi + + python tools/make_raw_list.py --vad_file ${data}/$dset/vad \ + ${data}/$dset/wav.scp \ + ${data}/$dset/utt2spk ${data}/$dset/raw.list + done + } + + true && { + # Convert all musan and rirs data to LMDB if they don't already exist. + for x in rirs musan;do + if [ ! -d $data/$x/lmdb ];then + python tools/make_lmdb.py ${data}/$x/wav.scp ${data}/$x/lmdb + fi + done + } + +fi + + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "Start training ..." + if [ ! 
-z $num_gpus_train ];then + gpus=$(python -c "from sys import argv; from safe_gpu import safe_gpu; safe_gpu.claim_gpus(int(argv[1])); print( safe_gpu.gpu_owner.devices_taken )" $num_gpus_train | sed "s: ::g") + else + num_gpus=$(echo $gpus | awk -F ',' '{print NF}') + fi + echo "Using $num_gpus_train GPUs: $gpus" + #torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus_train \ # The below is to prevent problems if many jobs run on the same machine + torchrun --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$((RANDOM)) --nnodes=1 --nproc_per_node=$num_gpus_train \ + wespeaker/bin/train.py --config $config \ + --exp_dir ${exp_dir} \ + --gpus $gpus \ + --num_avg ${num_avg} \ + --data_type "${data_type}" \ + --train_data ${data}/cts_vox/${data_type}.list \ + --train_label ${data}/cts_vox/utt2spk \ + --reverb_data ${data}/rirs/lmdb \ + --noise_data ${data}/musan/lmdb \ + ${checkpoint:+--checkpoint $checkpoint} +fi + + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + + false && { + echo "Do model average ..." + avg_model=$exp_dir/models/avg_model.pt + python wespeaker/bin/average_model.py \ + --dst_model $avg_model \ + --src_path $exp_dir/models \ + --num ${num_avg} + + model_path=$avg_model + if [[ $config == *repvgg*.yaml ]]; then + echo "convert repvgg model ..." + python wespeaker/models/convert_repvgg.py \ + --config $exp_dir/config.yaml \ + --load $avg_model \ + --save $exp_dir/models/convert_model.pt + model_path=$exp_dir/models/convert_model.pt + fi + } + + avg_model=$exp_dir/models/avg_model.pt + model_path=$avg_model + + echo "Extract embeddings ..." + avg_model=$exp_dir/models/avg_model.pt + model_path=$avg_model + gpus=$(python -c "from sys import argv; from safe_gpu import safe_gpu; safe_gpu.claim_gpus(int(argv[1])); print( safe_gpu.gpu_owner.devices_taken )" $num_gpus_extract | sed "s: ::g" ) + echo $gpus + local/extract_sre.sh \ + --exp_dir $exp_dir --model_path $model_path \ + --nj $num_gpus_extract --gpus $gpus --data_type raw --data ${data} \ + --reverb_data ${data}/rirs/lmdb \ + --noise_data ${data}/musan/lmdb \ + --aug_plda_data ${aug_plda_data} +fi + + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + echo "Export the final model ..." 
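+    # export_jit.py is expected to write a TorchScript archive, so the exported
+    # $exp_dir/models/final.zip can later be loaded for inference without the
+    # training code, roughly: model = torch.jit.load("<exp_dir>/models/final.zip")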
+ python wespeaker/bin/export_jit.py \ + --config $exp_dir/config.yaml \ + --checkpoint $exp_dir/models/avg_model.pt \ + --output_file $exp_dir/models/final.zip +fi + + +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + echo "### --- Score using Cosine Distance --- ###" + + # Use SRE16 unlabeled data for mean subraction + echo "### --- Mean: SRE16 unlabeled ("SRE16 Major") --- ###" + true && { + for dset in sre16_eval;do + echo " * $dset" + local/score.sh \ + --stage 1 --stop-stage 2 \ + --trials ${trials[$dset]} \ + --xvectors $exp_dir/embeddings/${xvectors[$dset]} \ + --cal_mean_dir ${exp_dir}/embeddings/sre16/major \ + --exp_dir $exp_dir + done + } + + # Use SRE18 unlabeled data for mean subraction + echo "### --- Mean: SRE18 Unlabeled --- ###" + true && { + for dset in sre18_eval sre18_dev;do + echo " * $dset" + local/score.sh \ + --stage 1 --stop-stage 2 \ + --trials ${trials[$dset]} \ + --xvectors $exp_dir/embeddings/${xvectors[$dset]} \ + --cal_mean_dir ${exp_dir}/embeddings/sre18/dev/unlabeled \ + --exp_dir $exp_dir + done + } + + # Use backend training data for mean subraction + echo "### --- Mean: SRE --- ###" + true && { + for dset in sre16_eval sre18_eval sre18_dev sre21_eval sre21_dev;do + echo " * $dset" + local/score.sh \ + --stage 1 --stop-stage 2 \ + --trials ${trials[$dset]} \ + --xvectors $exp_dir/embeddings/${xvectors[$dset]} \ + --cal_mean_dir ${exp_dir}/embeddings/cts_aug \ + --exp_dir $exp_dir + done + } + +fi + + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + echo "### --- Score with PLDA --- ###" + echo "### --- Mean: PLDA training set (cts_aug) --- ###" + + # Here we specify the embedding preprocessing to be used before backend modelling/scoring. + mean1_scp=${exp_dir}/embeddings/${sre_plda_data}/cts_aug/xvector.scp + lda_scp=${exp_dir}/embeddings/${sre_plda_data}/cts_aug/xvector.scp + utt2spk=${data}/cts_aug/utt2spk + lda_dim=100 + preprocessing_chain="mean-subtract --scp $mean1_scp | length-norm | lda --scp $lda_scp --utt2spk $utt2spk --dim $lda_dim | length-norm" + preprocessing_path_cts_aug=${exp_dir}/embd_proc_cts_aug.pkl + + # Run stage 1-6 here to train the embedding preprocessing chain and the PLDA model as well + # as to evaluate SRE16 which is the default set to evaluate if no eval set is provided. + true && { + local/score_plda.sh \ + --stage 1 --stop-stage 6 \ + --data ${data} \ + --exp_dir $exp_dir \ + --aug_plda_data ${aug_plda_data} \ + --preprocessing_chain "$preprocessing_chain" \ + --preprocessing_path "$preprocessing_path_cts_aug" + } + # Score the other sets. We need only stage 4-6 for this. + true && { + for dset in sre18_eval sre18_dev sre21_eval sre21_dev;do + local/score_plda.sh \ + --stage 4 --stop-stage 6 \ + --data ${data} \ + --exp_dir $exp_dir \ + --enroll_scp ${enr_scp[$dset]} \ + --test_scp ${test_scp[$dset]} \ + --aug_plda_data ${aug_plda_data} \ + --preprocessing_path "$preprocessing_path" \ + --preprocessing_path "$preprocessing_path_cts_aug" \ + --utt2spk ${utt2mdl[$dset]} \ + --trials ${trials[$dset]} + done + } + + # Score using SRE 16 unlab mean. We should not retrain the backend again, i.e. stage 2-3 + # but we do need to update the embedding preprocessing chain. 
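+    # Concretely, the first link of the chain estimated above,
+    #   mean-subtract --scp <.../cts_aug/xvector.scp>
+    # is replaced by
+    #   mean-subtract --scp <.../sre16/major/xvector.scp>
+    # while the length-norm and LDA links keep their already estimated parameters.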
+ mean1_scp=${exp_dir}/embeddings/sre16/major/xvector.scp + new_link="mean-subtract --scp $mean1_scp " + preprocessing_path_sre16_major=${exp_dir}/embd_proc_sre16_major.pkl + + # The following command replaces link 0 (cts_aug mean subtraction) with a new link (sre16 major mean subtraction) + python wespeaker/bin/update_embd_proc.py --in_path $preprocessing_path_cts_aug --out_path $preprocessing_path_sre16_major --link_no_to_remove 0 --new_link "$new_link" + + echo "### --- Mean: SRE16 Major --- ###" + true && { + local/score_plda.sh \ + --stage 4 --stop-stage 6 \ + --data ${data} \ + --exp_dir $exp_dir \ + --preprocessing_path "$preprocessing_path_sre16_major" + } + + # Similarly for SRE18 + mean1_scp=${exp_dir}/embeddings/sre18/dev/unlabeled/xvector.scp + new_link="mean-subtract --scp $mean1_scp " + preprocessing_path_sre18_unlab=${exp_dir}/embd_proc_sre18_dev_unlabeled.pkl + + python wespeaker/bin/update_embd_proc.py --in_path $preprocessing_path_cts_aug --out_path $preprocessing_path_sre18_unlab --link_no_to_remove 0 --new_link "$new_link" + + echo "### --- Mean: SRE18 Unlabeled --- ###" + true && { + for dset in sre18_eval sre18_dev;do + local/score_plda.sh \ + --stage 4 --stop-stage 6 \ + --data ${data} \ + --exp_dir $exp_dir \ + --preprocessing_path "$preprocessing_path_sre18_unlab" \ + --enroll_scp ${enr_scp[$dset]} \ + --test_scp ${test_scp[$dset]} \ + --utt2spk ${utt2mdl[$dset]} \ + --trials ${trials[$dset]} + done + } +fi + + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + + echo "Score with adapted PLDA ..." + + # --indomain_scp is by default sre16/major/xvector.scp in local/score_plda_adapt.sh. + # It is used for adaptation. Note that in other recipes, indomain_scp is passed to + # wespeaker/bin/eval_plda.py insdide local/score_plda_adapt.sh in which case it will + # be used for mean subtraction before scoring. In this recipe, mean subtraction is, + # however, part of the backend preprocessing chain and is therefore not used in + # wespeaker/bin/eval_plda.py. + + echo "### --- Mean: SRE16 Major --- ###" + true && { + local/score_plda_adapt.sh \ + --stage 1 --stop-stage 4 \ + --data ${data} \ + --exp_dir $exp_dir \ + --preprocessing_path ${exp_dir}/embd_proc_sre16_major.pkl \ + --aug_plda_data ${aug_plda_data} + } + + preprocessing_path_sre18_unlab=${exp_dir}/embd_proc_sre18_dev_unlabeled.pkl + echo "### --- Mean: SRE18 Unlabeled --- ###" + # Stage 1 is only needed to be run once per domain so we could have set stage 1-4 for + # sre18_eval and stage 1,3,4 for sre18_dev but since stage 2 is very fast we keep it + # in order to keep the script clean. + true && { + for dset in sre18_eval sre18_dev;do + + local/score_plda_adapt.sh \ + --stage 1 --stop-stage 4 \ + --data ${data} \ + --exp_dir $exp_dir \ + --aug_plda_data ${aug_plda_data} \ + --enroll_scp ${enr_scp[$dset]} \ + --test_scp ${test_scp[$dset]} \ + --preprocessing_path "$preprocessing_path_sre18_unlab" \ + --indomain_scp sre18/dev/unlabeled/xvector.scp \ + --utt2spk ${utt2mdl[$dset]} \ + --trials ${trials[$dset]} + done + } +fi + + +if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then + echo "### --- Score using Cosine Distance --- ###" + + # The preprocessed embeddings are already stored but we need to create the lists as + # score.sh wants them. This is a bit messy and therefore kept in a separate script. + ./local/create_preproc_embd_lists.sh $exp_dir + + # Note that cal_mean_dir should not be provided since the embedding preprocessing includes mean subtration. 
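+    # For each evaluation set, the list created above just appends the chain name to
+    # the original scp name, e.g.
+    #   sre16/eval/xvector.scp -> sre16/eval/xvector_proc_embd_proc_sre16_major.scp
+    # and that processed list is what is passed to local/score.sh below.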
+ + # Use SRE16 unlabeled data for mean subraction + echo "### --- Mean: SRE16 unlabeled ("SRE16 Major") --- ###" + true && { + preproc_name=embd_proc_sre16_major + for dset in sre16_eval;do + # The xvector list for the relevant preprocessing chain. + new_xvectors=$(echo $exp_dir/embeddings/${xvectors[$dset]} | sed "s:\.scp:_proc_$preproc_name\.scp:") + echo " * $new_xvectors" + local/score.sh \ + --stage 1 --stop-stage 2 \ + --trials ${trials[$dset]} \ + --xvectors $new_xvectors \ + --exp_dir $exp_dir + done + + } + + # Use SRE18 unlabeled data for mean subraction + echo "### --- Mean: SRE18 Unlabeled --- ###" + true && { + preproc_name=embd_proc_sre18_dev_unlabeled + for dset in sre18_eval sre18_dev;do + new_xvectors=$(echo $exp_dir/embeddings/${xvectors[$dset]} | sed "s:\.scp:_proc_$preproc_name\.scp:") + echo " * $new_xvectors" + local/score.sh \ + --stage 1 --stop-stage 2 \ + --trials ${trials[$dset]} \ + --xvectors $new_xvectors \ + --exp_dir $exp_dir + done + } + + # Use backend training data for mean subraction + echo "### --- Mean: SRE --- ###" + true && { + preproc_name=embd_proc_cts_aug + for dset in sre16_eval sre18_eval sre18_dev sre21_eval sre21_dev;do + new_xvectors=$(echo $exp_dir/embeddings/${xvectors[$dset]} | sed "s:\.scp:_proc_$preproc_name\.scp:") + echo " * $new_xvectors" + local/score.sh \ + --stage 1 --stop-stage 2 \ + --trials ${trials[$dset]} \ + --xvectors $new_xvectors \ + --exp_dir $exp_dir + done + } + +fi + + + +if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then + # Summarize results + echo "" + echo "----------------------------------------------------" + echo "### --- Summary of results (EER / minDCF0.01)--- ###" + echo "----------------------------------------------------" + # Make the header + eval_data='system' + for dset in sre16_eval sre18_dev sre18_eval sre21_dev sre21_eval;do + for x in $(echo ${trials[$dset]} | tr "," " "); do + xx=$(basename $x) + eval_data="$eval_data, $xx " + done + done + echo $eval_data > results_summary.txt + # Collect the results + for sys in mean_cts_aug_cos mean_sre16_major_cos mean_sre18_dev_unlabeled_cos \ + proc_embd_proc_cts_aug_cos proc_embd_proc_sre16_major_cos proc_embd_proc_sre18_dev_unlabeled_cos \ + proc_embd_proc_cts_aug_plda proc_embd_proc_sre16_major_plda proc_embd_proc_sre18_dev_unlabeled_plda \ + proc_embd_proc_sre16_major_plda_adapt proc_embd_proc_sre18_dev_unlabeled_plda_adapt;do + res="$sys," + for dset in sre16_eval sre18_dev sre18_eval sre21_dev sre21_eval;do + for x in $(echo ${trials[$dset]} | tr "," " "); do + xx=$(basename $x) + eval_data="$eval_data $xx " + if [ -e ${exp_dir}/scores/${xx}.${sys}.result ];then + res="$res $(grep EER ${exp_dir}/scores/${xx}.${sys}.result | sed 's:.* = ::')" + res="$res / $(grep minDCF ${exp_dir}/scores/${xx}.${sys}.result | sed 's:.* = ::')," + else + res="$res - -," + fi + done + done + echo -e $res >> results_summary.txt + done + column -t -s"," results_summary.txt + echo "" + echo "-------------------------------------------------------" + echo "### --- CSV for copy-paste to google sheet etc. 
--- ###" + echo "-------------------------------------------------------" + tail -n+2 results_summary.txt | sed "s:/:,:g" | sed "s: :,:g"| sed -r "s:,+:,:g" + + +fi diff --git a/examples/sre/v3/tools b/examples/sre/v3/tools new file mode 120000 index 00000000..c92f4172 --- /dev/null +++ b/examples/sre/v3/tools @@ -0,0 +1 @@ +../../../tools \ No newline at end of file diff --git a/examples/sre/v3/wespeaker b/examples/sre/v3/wespeaker new file mode 120000 index 00000000..900c560b --- /dev/null +++ b/examples/sre/v3/wespeaker @@ -0,0 +1 @@ +../../../wespeaker \ No newline at end of file diff --git a/tools/combine_data.sh b/tools/combine_data.sh new file mode 100755 index 00000000..12e2e3cd --- /dev/null +++ b/tools/combine_data.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# 2014 David Snyder + +# This script combines the data from multiple source directories into +# a single destination directory. + +# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information +# about what these directories contain. + +# Begin configuration section. +extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." +skip_fix=false # skip the fix_data_dir.sh in the end +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. tools/parse_options.sh || exit 1; + +if [ $# -lt 2 ]; then + echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." + echo "Note, files that don't appear in all source dirs will not be combined," + echo "with the exception of utt2uniq and segments, which are created where necessary." + exit 1 +fi + +dest=$1; +shift; + +first_src=$1; + +rm -r $dest 2>/dev/null || true +mkdir -p $dest; + +export LC_ALL=C + +for dir in $*; do + if [ ! -f $dir/utt2spk ]; then + echo "$0: no such file $dir/utt2spk" + exit 1; + fi +done + +# Check that frame_shift are compatible, where present together with features. +dir_with_frame_shift= +for dir in $*; do + if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then + if [[ $dir_with_frame_shift ]] && + ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then + echo "$0:error: different frame_shift in directories $dir and " \ + "$dir_with_frame_shift. Cannot combine features." + exit 1; + fi + dir_with_frame_shift=$dir + fi +done + +# W.r.t. utt2uniq file the script has different behavior compared to other files +# it is not compulsary for it to exist in src directories, but if it exists in +# even one it should exist in all. We will create the files where necessary +has_utt2uniq=false +for in_dir in $*; do + if [ -f $in_dir/utt2uniq ]; then + has_utt2uniq=true + break + fi +done + +if $has_utt2uniq; then + # we are going to create an utt2uniq file in the destdir + for in_dir in $*; do + if [ ! -f $in_dir/utt2uniq ]; then + # we assume that utt2uniq is a one to one mapping + cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' + else + cat $in_dir/utt2uniq + fi + done | sort -k1 > $dest/utt2uniq + echo "$0: combined utt2uniq" +else + echo "$0 [info]: not combining utt2uniq as it does not exist" +fi +# some of the old scripts might provide utt2uniq as an extrafile, so just remove it +extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") + +# segments are treated similarly to utt2uniq. If it exists in some, but not all +# src directories, then we generate segments where necessary. 
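+# (A segments line is "<utt-id> <reco-id> <start-time> <end-time>"; a utt2uniq line
+# maps a copied/perturbed utterance id back to its original id, and when it is
+# missing in a source directory it is filled in as the identity mapping
+# "<utt-id> <utt-id>", as done in the utt2uniq block above.)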
+has_segments=false +for in_dir in $*; do + if [ -f $in_dir/segments ]; then + has_segments=true + break + fi +done + +if $has_segments; then + for in_dir in $*; do + if [ ! -f $in_dir/segments ]; then + echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 + tools/data/get_segments_for_data.sh $in_dir + else + cat $in_dir/segments + fi + done | sort -k1 > $dest/segments + echo "$0: combined segments" +else + echo "$0 [info]: not combining segments as it does not exist" +fi + +for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do + exists_somewhere=false + absent_somewhere=false + for d in $*; do + if [ -f $d/$file ]; then + exists_somewhere=true + else + absent_somewhere=true + fi + done + + if ! $absent_somewhere; then + set -o pipefail + ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; + set +o pipefail + echo "$0: combined $file" + else + if ! $exists_somewhere; then + echo "$0 [info]: not combining $file as it does not exist" + else + echo "$0 [info]: **not combining $file as it does not exist everywhere**" + fi + fi +done + +tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt + +if [[ $dir_with_frame_shift ]]; then + cp $dir_with_frame_shift/frame_shift $dest +fi + +if ! $skip_fix ; then + tools/fix_data_dir.sh $dest || exit 1; +fi + +exit 0 diff --git a/tools/copy_data_dir.sh b/tools/copy_data_dir.sh new file mode 100755 index 00000000..c4cd4db6 --- /dev/null +++ b/tools/copy_data_dir.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash + +# Copyright 2023 Brno University of Techology (author: Johan Rohdin) +# Apache 2.0 + +# Copies wav.scp as well as utt2spk spk2utt if they are available. The script +# can also take a list of speakers or utterances to keep. If provided, only +# utterances/speakers in the list are kept. + +src_dir=$1 +dest_dir=$2 + +shift 2 + +update_wav_path=false +utt_list="" +spk_list="" + +. tools/parse_options.sh || exit 1 + +if [ "$dest_dir" == "$src_dir" ]; then + echo "$0 ERROR: Input directory () and output directory () are the same." + exit 1 +fi + +mkdir -p $dest_dir + + +if [ ! -z "$utt_list" ]; then + echo "UTTLIST" +fi +if [ ! -z "$spk_list" ]; then + echo "SPKLIST" +fi + + + + +#if [ $utt_list != "" ] && [ $spk_list != "" ]; then +if [ ! -z "$utt_list" ] && [ ! -z "$spk_list" ]; then + echo "$0 ERROR: Providing both utt_list and spk_list not supported." + exit 1 +fi + + + +if [ ! -f $src_dir/utt2spk ]; then + echo "$0 WARNING: copy_data_dir.sh: no such file $src_dir/utt2spk" +else + if [ ! -z "$utt_list" ];then + awk 'NR==FNR{a[$1];next}$1 in a{print $0}' $utt_list $src_dir/utt2spk > $dest_dir/utt2spk + elif [ ! -z "$spk_list" ];then + #echo "A" + awk 'NR==FNR{a[$1];next}$2 in a{print $0}' $spk_list $src_dir/utt2spk > $dest_dir/utt2spk + else + cp $src_dir/utt2spk $dest_dir/utt2spk + fi +fi + + +if [ ! -f $src_dir/spk2utt ]; then + echo "$0 WARNING: copy_data_dir.sh: no such file $src_dir/spk2utt" +else + if [ ! -z "$utt_list" ];then + # This will work even if utt2spk doesn't exist and was simpler than reducing spk2utt directly. + cat $scrdir/spk2utt | tools/spk2utt_to_utt2spk.pl \ + | awk 'NR==FNR{a[$1];next}$1 in a{print $0}' $utt_list - \ + | tools/utt2spk_to_spk2utt.pl > $dest_dir/spk2utt + + elif [ ! -z "$spk_list" ];then + awk 'NR==FNR{a[$1];next}$1 in a{print $0}' $spk_list $src_dir/spk2utt > $dest_dir/spk2utt + else + cp $src_dir/spk2utt $dest_dir/spk2utt + fi +fi + + +if [ ! 
-f $src_dir/wav.scp ]; then + echo "$0 ERROR: copy_data_dir.sh: no such file $src_dir/wav.scp" + exit 1; +else + if [ $update_wav_path == true ];then + src_root_dir=$(readlink -f $src_dir | sed "s:data/.*::") + dest_root_dir=$(readlink -f $dest_dir | sed "s:data/.*::") + cat $src_dir/wav.scp | sed "s:$src_root_dir:$dest_root_dir:" > $dest_dir/wav.scp + else + cp $src_dir/wav.scp $dest_dir/wav.scp + fi +fi + + +# Sanity checks +if [ -f $dest_dir/utt2spk ];then + if [ $( wc -l $dest_dir/utt2spk | cut -f1 -d" ") -ne $( wc -l $dest_dir/wav.scp | cut -f1 -d" " ) ];then + echo "ERROR: Length of utt2spk and wav.scp doesn't match." + exit 1 + fi + if [ -f $src_dir/spk2utt ]; then + if [ $( cat $dest_dir/utt2spk | sort | md5sum | cut -f1 -d" " ) != $( tools/spk2utt_to_utt2spk.pl $dest_dir/spk2utt | sort | md5sum | cut -f1 -d" " ) ];then + echo "ERROR: utt2spk and spk2utt doesn't match." + exit 1 + fi + fi +fi + + + + diff --git a/tools/make_shard_list.py b/tools/make_shard_list.py index 12970297..809410f6 100644 --- a/tools/make_shard_list.py +++ b/tools/make_shard_list.py @@ -197,10 +197,17 @@ def main(): if vad_dict is None: data.append((key, spk, wav)) else: + """ if key not in vad_dict: continue vad = vad_dict[key] data.append((key, spk, wav, vad)) + """ + if key not in vad_dict: + data.append((key, spk, wav)) + else: + vad = vad_dict[key] + data.append((key, spk, wav, vad)) if args.shuffle: random.shuffle(data) diff --git a/tools/subset_data_dir.sh b/tools/subset_data_dir.sh new file mode 100755 index 00000000..7ae0c663 --- /dev/null +++ b/tools/subset_data_dir.sh @@ -0,0 +1,192 @@ +#!/usr/bin/env bash +# Copyright 2010-2011 Microsoft Corporation +# 2012-2013 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + + +# This script operates on a data directory, such as in data/train/. +# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data +# for what these directories contain. + +# This script creates a subset of that data, consisting of some specified +# number of utterances. (The selected utterances are distributed evenly +# throughout the file, by the program ./subset_scp.pl). + +# There are six options, none compatible with any other. + +# If you give the --per-spk option, it will attempt to select the supplied +# number of utterances for each speaker (typically you would supply a much +# smaller number in this case). + +# If you give the --speakers option, it selects a subset of n randomly +# selected speakers. + +# If you give the --shortest option, it will give you the n shortest utterances. + +# If you give the --first option, it will just give you the n first utterances. + +# If you give the --last option, it will just give you the n last utterances. + +# If you give the --spk-list or --utt-list option, it reads the +# speakers/utterances to keep from /" (note, +# in this case there is no positional parameter; see usage message.) 
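+# Typical invocations (data directory names are only examples):
+#   tools/subset_data_dir.sh data/train 100000 data/train_100k          # random subset
+#   tools/subset_data_dir.sh --per-spk data/train 5 data/train_5perspk  # up to 5 utts per speaker
+#   tools/subset_data_dir.sh --utt-list keep_utts.txt data/train data/train_subset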
+ + +shortest=false +perspk=false +speakers=false +first_opt= +spk_list= +utt_list= + +expect_args=3 +case $1 in + --first|--last) first_opt=$1; shift ;; + --per-spk) perspk=true; shift ;; + --shortest) shortest=true; shift ;; + --speakers) speakers=true; shift ;; + --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; + --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; + --*) echo "$0: invalid option '$1'"; exit 1 +esac + +if [ $# != $expect_args ]; then + echo "Usage:" + echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " + echo " subset_data_dir.sh [--spk-list ] " + echo " subset_data_dir.sh [--utt-list ] " + echo "By default, randomly selects utterances from the data directory." + echo "With --speakers, randomly selects enough speakers that we have utterances" + echo "With --per-spk, selects utterances per speaker, if available." + echo "With --first, selects the first utterances" + echo "With --last, selects the last utterances" + echo "With --shortest, selects the shortest utterances." + echo "With --spk-list, reads the speakers to keep from " + echo "With --utt-list, reads the utterances to keep from " + exit 1; +fi + +srcdir=$1 +if [[ $spk_list || $utt_list ]]; then + numutt= + destdir=$2 +else + numutt=$2 + destdir=$3 +fi + +export LC_ALL=C + +if [ ! -f $srcdir/utt2spk ]; then + echo "$0: no such file $srcdir/utt2spk" + exit 1 +fi + +if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then + echo "$0: cannot subset to more utterances than you originally had." + exit 1 +fi + +if $shortest && [ ! -f $srcdir/feats.scp ]; then + echo "$0: you selected --shortest but no feats.scp exist." + exit 1 +fi + +mkdir -p $destdir || exit 1 + +if [[ $spk_list ]]; then + tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; + tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; +elif [[ $utt_list ]]; then + tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; + tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; +elif $speakers; then + tools/shuffle_list.pl < $srcdir/spk2utt | + awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | + sort > $destdir/spk2utt + tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk +elif $perspk; then + awk '{ n='$numutt'; printf("%s ",$1); + skip=1; while(n*(skip+1) <= NF-1) { skip++; } + for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } + printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt + tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk +else + if $shortest; then + # Select $numutt shortest utterances. + . ./path.sh + feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; + sort -n -k2 $destdir/tmp.len | + awk '{print $1}' | + head -$numutt >$destdir/tmp.uttlist + tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk + rm $destdir/tmp.uttlist $destdir/tmp.len + else + # Select $numutt random utterances. + tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; + fi + tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt +fi + +# Perform filtering. utt2spk and spk2utt files already exist by this point. +# Filter by utterance. 
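+# Each of the per-utterance files below is filtered with tools/filter_scp.pl, which
+# keeps only the lines whose first field (the utterance id) also occurs in
+# $destdir/utt2spk.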
+[ -f $srcdir/feats.scp ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp +[ -f $srcdir/vad.scp ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp +[ -f $srcdir/utt2lang ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang +[ -f $srcdir/utt2dur ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur +[ -f $srcdir/utt2num_frames ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames +[ -f $srcdir/utt2uniq ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq +[ -f $srcdir/wav.scp ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp +[ -f $srcdir/utt2warp ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp +[ -f $srcdir/text ] && + tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text + +# Filter by speaker. +[ -f $srcdir/spk2warp ] && + tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp +[ -f $srcdir/spk2gender ] && + tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender +[ -f $srcdir/cmvn.scp ] && + tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp + +# Filter by recording-id. +if [ -f $srcdir/segments ]; then + tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments + # Recording-ids are in segments. + awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco + # The next line overrides the command above for wav.scp, which would be incorrect. + [ -f $srcdir/wav.scp ] && + tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp +else + # No segments; recording-ids are in wav.scp. + awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco +fi + +[ -f $srcdir/reco2file_and_channel ] && + tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel +[ -f $srcdir/reco2dur ] && + tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur + +# Filter the STM file for proper sclite scoring. +# Copy over the comments from STM file. +[ -f $srcdir/stm ] && + (grep "^;;" $srcdir/stm + tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm + +rm $destdir/reco + +# Copy frame_shift if present. +[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir + +srcutts=$(wc -l <$srcdir/utt2spk) +destutts=$(wc -l <$destdir/utt2spk) +echo "$0: reducing #utt from $srcutts to $destutts" +exit 0 diff --git a/wespeaker/bin/apply_embd_proc.py b/wespeaker/bin/apply_embd_proc.py new file mode 100644 index 00000000..a7149ed4 --- /dev/null +++ b/wespeaker/bin/apply_embd_proc.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
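+
+# Apply a previously prepared (pickled) embedding processing chain to a Kaldi-style
+# scp of embeddings and write the processed vectors as ark/scp. Typical call, with
+# paths as used by local/score_plda.sh (shown only as an example):
+#   python wespeaker/bin/apply_embd_proc.py \
+#       --path $exp_dir/embd_proc_cts_aug.pkl \
+#       --input $exp_dir/embeddings/cts_aug/xvector.scp \
+#       --output $exp_dir/embeddings/cts_aug/xvector_proc_embd_proc_cts_aug.ark,scp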
+ +import argparse +import kaldiio +import numpy as np +from wespeaker.utils.embedding_processing import EmbeddingProcessingChain + +if __name__ == '__main__': + """ + xxx + """ + parser = argparse.ArgumentParser() + parser.add_argument('--path', + type=str, + default='', + help='Path to processing chain.') + parser.add_argument('--input', + type=str, + default='', + help='Input scp file.') + parser.add_argument('--output', + type=str, + default='', + help='Output scp/ark file.') + args = parser.parse_args() + + processingChain = EmbeddingProcessingChain() + processingChain.load(args.path) + + embd = [] + utt = [] + for k, v in kaldiio.load_scp_sequential(args.input): + utt.append(k) + embd.append(v) + embd = np.array(embd) + utt = np.array(utt) + + print("Read {} embeddings of dimension {}.".format(embd.shape[0], + embd.shape[1])) + + embd = processingChain(embd) + + # Store both ark and scp if extention '.ark,scp' or '.scp,ark'. Or, only + # ark if extension is '.ark' + output_file = args.output + if output_file.endswith('ark,scp') or output_file.endswith('scp,ark'): + output_file = output_file.rstrip('ark,scp') + output_file = output_file.rstrip('scp,ark') + with kaldiio.WriteHelper('ark,scp:' + output_file + "ark," + + output_file + 'scp') as writer: + for i, u in enumerate(utt): + e = embd[i] + writer(u, e) + + elif output_file.endswith('ark'): + with kaldiio.WriteHelper('ark:' + output_file) as writer: + for i, u in enumerate(utt): + e = embd[i] + writer(u, e) + else: + raise Exception( + "Invalid file extension of output file {}".format(output_file)) + + print("Wrote {} embeddings of dimension {}.".format( + embd.shape[0], embd.shape[1])) diff --git a/wespeaker/bin/eval_plda.py b/wespeaker/bin/eval_plda.py index 1481e874..13faaf5a 100644 --- a/wespeaker/bin/eval_plda.py +++ b/wespeaker/bin/eval_plda.py @@ -37,9 +37,14 @@ type=str, help='score file to write to') parser.add_argument('--trial', type=str, help='trial file to score upon') + parser.add_argument('--multisession_avg', default=False, action="store_true", + help='Whether to score multisession by average instead ' + 'of by-the-book. Default False.') + args = parser.parse_args() kaldi_format = True if args.type == 'kaldi' else False plda = TwoCovPLDA.load_model(args.model_path, kaldi_format) plda.eval_sv(args.enroll_scp_path, args.utt2spk, args.test_scp_path, - args.trial, args.score_path, args.indomain_scp_path) + args.trial, args.score_path, args.multisession_avg, + args.indomain_scp_path) diff --git a/wespeaker/bin/prep_embd_proc.py b/wespeaker/bin/prep_embd_proc.py new file mode 100644 index 00000000..445eab08 --- /dev/null +++ b/wespeaker/bin/prep_embd_proc.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
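+
+# Build an EmbeddingProcessingChain from the --chain specification (the
+# parameters of each step are estimated when the chain is constructed) and
+# save the chain, including the estimated parameters, in pickle format.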
+ +import argparse + +from wespeaker.utils.embedding_processing import EmbeddingProcessingChain + +if __name__ == '__main__': + """ + xxx + """ + parser = argparse.ArgumentParser() + parser.add_argument('--chain', + type=str, + default='whitening | length-norm ', + help='') + parser.add_argument('--path', type=str) + args = parser.parse_args() + + processingChain = EmbeddingProcessingChain(chain=args.chain) + processingChain.save(args.path) diff --git a/wespeaker/bin/score.py b/wespeaker/bin/score.py index b65209de..d91153e8 100644 --- a/wespeaker/bin/score.py +++ b/wespeaker/bin/score.py @@ -73,6 +73,8 @@ def trials_cosine_score(eval_scp_path='', def main(exp_dir, eval_scp_path, cal_mean, cal_mean_dir, *trials): + + print(cal_mean) if not cal_mean: print("Do not do mean normalization for evaluation embeddings.") mean_vec_path = None diff --git a/wespeaker/bin/update_embd_proc.py b/wespeaker/bin/update_embd_proc.py new file mode 100644 index 00000000..d8a8d7c5 --- /dev/null +++ b/wespeaker/bin/update_embd_proc.py @@ -0,0 +1,45 @@ +# Copyright (c) 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from wespeaker.utils.embedding_processing import EmbeddingProcessingChain + +if __name__ == '__main__': + """ + xxx + """ + parser = argparse.ArgumentParser() + parser.add_argument('--in_path', + type=str, + default='', + help='Path where to load original processing chain.') + parser.add_argument('--out_path', + type=str, + default='', + help='Path where to save updated processing chain.') + parser.add_argument('--link_no_to_remove', + type=int, + default='', + help='Input scp file.') + parser.add_argument( + '--new_link', + type=str, + default='', + help='new link, e.g., "mean-subtract --scp new_scp_for_mean.scp".') + args = parser.parse_args() + + processingChain = EmbeddingProcessingChain() + processingChain.load(args.in_path) + processingChain.update_link(args.link_no_to_remove, args.new_link) + processingChain.save(args.out_path) diff --git a/wespeaker/utils/embedding_processing.py b/wespeaker/utils/embedding_processing.py new file mode 100644 index 00000000..7595ffac --- /dev/null +++ b/wespeaker/utils/embedding_processing.py @@ -0,0 +1,271 @@ +# Copyright (c) 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
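+
+# Chains of embedding processing steps (mean subtraction, length
+# normalization, LDA, ...). A chain is specified as a string of steps
+# separated by "|", e.g.
+#   "mean-subtract --scp mean.scp | length-norm | lda --scp lda.scp --utt2spk utt2spk --dim 100 | length-norm"
+# Each step estimates its parameters on the data given by its own options,
+# after that data has been processed by the preceding steps. The whole chain
+# can be saved to and loaded from pickle format.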
+
+import re
+import kaldiio
+import pickle
+import scipy.linalg as spl
+import numpy as np
+from wespeaker.utils.plda.plda_utils import get_data_for_plda
+
+
+def chain_string_to_dict(chain_string=None):
+    # This function converts an input string into a list and dictionary
+    # structure suitable for use by the embedding processing classes below.
+    # For example,
+    # "mean-subtract --scp mean1_xvector.scp | length-norm "
+    # "| lda --scp lda_xvector.scp --utt2spk utt2spk --dim 100 "
+    # "| length-norm"
+    # (The above three lines are supposed to be one long string, but style
+    # rules prevent it from being written that way here.)
+    # becomes
+    # [
+    #   ['mean-subtract', {'scp': 'mean1_xvector.scp'}],
+    #   ['length-norm', {}],
+    #   ['lda', {'scp': 'lda_xvector.scp',
+    #            'utt2spk': 'utt2spk',
+    #            'dim': '100'}],
+    #   ['length-norm', {}]
+    # ]
+
+    if chain_string is not None:
+        links = chain_string.split('|')
+    else:
+        links = []
+
+    a = []
+    for l in links:
+
+        x = l.split('--')
+        method = x.pop(0)
+        method = method.lstrip(' ')
+        method = method.rstrip(' ')
+
+        args_and_values = {}
+        for xx in x:
+            xx = re.sub("=", " ", xx)
+            xx = re.sub(" +", " ", xx)
+            xx = xx.lstrip(' ')
+            xx = xx.rstrip(' ')
+            xx = xx.split(' ')
+            assert len(xx) == 2
+            args_and_values[xx[0]] = xx[1]
+
+        a.append([method, args_and_values])
+
+    return a
+
+
+class Lda:
+
+    def compute_mean_and_lda_scatter_matrices(self,
+                                              scp_file,
+                                              utt2spk_file,
+                                              equal_speaker_weight=False,
+                                              current_chain=None):
+        # equal_speaker_weight: If True, each speaker is considered equally
+        # important in the calculation of the mean and scatter matrices. If
+        # False, speakers are weighted by their number of utterances.
+        if current_chain is None:
+            current_chain = []
+        _, embeddings_dict = get_data_for_plda(scp_file, utt2spk_file)
+        speakers = embeddings_dict.keys()
+        speaker_counts = []
+        speaker_means = []
+        speaker_covs = []
+        n_used = 0
+        n_skipped = 0
+        for s in speakers:
+            embd_s = current_chain(np.vstack(embeddings_dict[s]))
+            count_s = embd_s.shape[0]
+            # With bias=False we need at least 2 utterances for a speaker,
+            # with bias=True we need at least 1, although that would result
+            # in a covariance matrix whose elements are all 0 (which is not
+            # necessarily wrong).
+            if count_s > 1:
+                mean_s = np.mean(embd_s, axis=0)
+                cov_s = np.cov(embd_s, rowvar=False, bias=True)
+                n_used += 1
+                speaker_counts.append(count_s)
+                speaker_means.append(mean_s)
+                speaker_covs.append(cov_s)
+
+            else:
+                n_skipped += 1
+
+        speaker_counts = np.array(speaker_counts)
+        speaker_means = np.vstack(speaker_means)
+        speaker_covs = np.array(speaker_covs)
+        print(
+            " #speakers: {}, #used {}, #skipped {} (having only one utterance)"
+            .format(len(speakers), n_used, n_skipped))
+
+        if equal_speaker_weight:
+            mean = np.mean(speaker_means, axis=0)
+            between_class_covariance = np.cov(speaker_means,
+                                              rowvar=False,
+                                              bias=True)
+            within_class_covariance = np.sum(speaker_covs,
+                                             axis=0) / len(speakers)
+        else:
+            mean = np.sum(speaker_counts[:, np.newaxis] * speaker_means,
+                          axis=0) / np.sum(speaker_counts)
+            between_class_covariance = np.cov(speaker_means,
+                                              rowvar=False,
+                                              bias=True,
+                                              fweights=speaker_counts)
+            within_class_covariance = np.sum(
+                speaker_counts[:, np.newaxis, np.newaxis] * speaker_covs,
+                axis=0) / np.sum(speaker_counts)
+
+        return mean, between_class_covariance, within_class_covariance
+
+    def __init__(self, args, current_chain=None):
+        if current_chain is None:
+            current_chain = []
+
+        print(" LDA")
+        scp_file = args['scp']
+        utt2spk_file = args['utt2spk']
+        dim = int(args['dim'])
+        eps = float(args['eps']) if 'eps' in args else 1e-6
+
+        self.m, BC, WC = self.compute_mean_and_lda_scatter_matrices(
+            scp_file, utt2spk_file, current_chain=current_chain)
+
+        E, M = spl.eigh(WC)
+        # Floor the within-class covariance eigenvalues. We noticed that this
+        # was done in Kaldi.
+        E_floor = np.max(E) * eps
+        E[E < E_floor] = E_floor
+        """
+        # The new within-class covariance.
+        WC = M.dot(np.diag(E).dot(M.T))
+        D, lda = spl.eigh( BC, WC )        # The output of eigh is sorted in
+        self.lda = lda[:,-dim:]            # ascending order, so we keep the
+        self.T1  = np.eye(self.m.shape[0]) # "dim" last eigenvectors.
+        """
+        # Since we have already found the eigen decomposition of WC, we could
+        # whiten it by T1 = 1 / sqrt(E), I = T1 WC T1'. So instead of solving
+        # spl.eigh( BC, WC ) we can apply T1 on BC and solve
+        # spl.eigh( T1 BC T1', T1 WC T1' )
+        #  = spl.eigh( T1 BC T1', I )
+        #  = spl.eigh( T1 BC T1')
+        # as follows. However, T1 then needs to be included when transforming
+        # the data. In either case, the result is that after the LDA transform
+        # the data will have a white WC and a diagonal BC.
+        T1 = np.dot(np.diag(1 / np.sqrt(E)), M.T)
+        BC = np.dot(np.dot(T1, BC), T1.T)
+        D, lda = spl.eigh(BC)
+        self.lda = np.dot(T1.T, lda[:, -dim:])
+
+        print(" Input dimension: {}, output dimension: {},"
+              " sum of all eigenvalues {:.2f}, sum of kept eigenvalues {:.2f}".
+              format(len(D), dim, np.sum(D), np.sum(D[-dim:])))
+        print(" All eigenvalues: {}".format(D))
+
+    def __call__(self, embd):
+        return (embd - self.m).dot(self.lda)
+
+
+class Length_norm:
+
+    def __init__(self, args=None, current_chain=None):
+        pass
+
+    def __call__(self, embd):
+        embd_proc = embd.copy()
+        embd_proc /= np.sqrt((embd_proc**2).sum(
+            axis=1)[:, np.newaxis])  # This makes the lengths equal to one.
+        """
+        Todo: For Kaldi compatibility we may want to add Kaldi style
+        normalization (scaling by sqrt(dim)) as an option, i.e.,
+ embd_proc *= np.sqrt(embd_normed.shape[1]) + """ + return (embd_proc) + + +class Whitening: + + def __init__(self, args, current_chain): + pass + + +class MeanSubtraction(): + + def __init__(self, args, current_chain=None): + if current_chain is None: + current_chain = [] + + e = [] + for key, vec in kaldiio.load_scp_sequential(args['scp']): + e.append(vec) + self.mean = np.mean(current_chain(np.vstack(e)), axis=0) + + def __call__(self, embd): + return embd - self.mean + + +class EmbeddingProcessingChain: + + # This is used to map the processing steps, coming from the input + # argument as strings, into the corresponding clases. + string2class = { + 'lda': Lda, + 'length-norm': Length_norm, + 'whitening': Whitening, + 'mean-subtract': MeanSubtraction + } + + def __init__(self, chain=None): + c = chain_string_to_dict(chain) + self.chain_of_classes = [] # This is not a great name... + for m, a in c: + print("Method: {}".format(m)) + print("Argument: {}".format(a)) + self.chain_of_classes.append(self.string2class[m](a, self)) + + def __call__(self, embd): + for c in self.chain_of_classes: + embd = c(embd) + return embd + + def save(self, path, data_format='pickle'): + print("Saving embedding processing chain to {}".format(path)) + with open(path, 'wb') as f: + pickle.dump(self.chain_of_classes, f) + + def load(self, path, data_format='pickle'): + print("Loading embedding processing chain from {}".format(path)) + with open(path, 'rb') as f: + self.chain_of_classes = pickle.load(f) + + def update_link(self, link_no_to_replace, new_link): + nl = chain_string_to_dict(new_link) + + # For now, it is only supported to update one link. This + # should be generalized in the future. + assert len(nl) == 1, "Length of new chain must be one." + + m, a = nl[0] + old_chain_of_classes = self.chain_of_classes + self.chain_of_classes = [] + + for i, ol in enumerate(old_chain_of_classes): + if (i != link_no_to_replace): + self.chain_of_classes.append(ol) + else: + print("Replacing link number {} ({}) with".format(i, ol)) + print("Method: {}".format(m)) + print("Argument: {}".format(a)) + self.chain_of_classes.append(self.string2class[m](a, self)) diff --git a/wespeaker/utils/plda/plda_utils.py b/wespeaker/utils/plda/plda_utils.py index 0b34bdeb..dab0923b 100644 --- a/wespeaker/utils/plda/plda_utils.py +++ b/wespeaker/utils/plda/plda_utils.py @@ -65,11 +65,16 @@ def get_data_for_plda(scp_file, utt2spk_file): model_dict = {} for key, vec in samples_dict.items(): samples.append(vec) - label = labels_dict[key] - if label in model_dict.keys(): - model_dict[label].append(vec) + if key in labels_dict: + label = labels_dict[key] + if label in model_dict.keys(): + model_dict[label].append(vec) + else: + model_dict[label] = [vec] else: - model_dict[label] = [vec] + print("WARNING: {} not in utt2spk ({}), skipping it.".format( + key, utt2spk_file)) + return np.vstack(samples), model_dict diff --git a/wespeaker/utils/plda/two_cov_plda.py b/wespeaker/utils/plda/two_cov_plda.py index b4d74317..ded9ef2b 100644 --- a/wespeaker/utils/plda/two_cov_plda.py +++ b/wespeaker/utils/plda/two_cov_plda.py @@ -1,5 +1,7 @@ # Copyright (c) 2022 Shuai Wang (wsstriving@gmail.com) # 2023 Shuai Wang, Houjun Huang +# 2024 Johan Rohdin (rohdin@fit.vutbr.cz) +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -67,7 +69,9 @@ def __init__(self, scp_file=None, utt2spk_file=None, embed_dim=256, - normalize_length=True): + subtract_train_set_mean=False, + normalize_length=False): + self.subtract_train_set_mean = subtract_train_set_mean self.normalize_length = normalize_length self.dim = embed_dim self.mu = np.zeros(self.dim) @@ -87,11 +91,15 @@ def __init__(self, if scp_file is not None: samples, self.embeddings_dict = get_data_for_plda( scp_file, utt2spk_file) - train_mean_vec = samples.mean(0) + if subtract_train_set_mean: + train_mean_vec = samples.mean(0) + else: + train_mean_vec = np.zeros(embed_dim) for key, mat in self.embeddings_dict.items(): mat = np.vstack(mat) mat = mat - train_mean_vec - mat = norm_embeddings(mat) + if self.normalize_length: + mat = norm_embeddings(mat) self.stats.add_samples(1.0, mat) self.mu = self.stats.sum_ / self.stats.class_weight @@ -155,9 +163,10 @@ def transform_embedding(self, embedding): return transformed_embedding def log_likelihood_ratio(self, transformed_train_embedding, - transformed_test_embedding): - mean = self.psi / (self.psi + 1.0) * transformed_train_embedding - variance = 1.0 + self.psi / (self.psi + 1.0) + transformed_test_embedding, n): + mean = n * self.psi / (n * self.psi + + 1.0) * transformed_train_embedding + variance = 1.0 + self.psi / (n * self.psi + 1.0) logdet = np.sum(np.log(variance)) sqdiff = transformed_test_embedding - mean sqdiff = np.power(sqdiff, 2.0) @@ -180,6 +189,7 @@ def eval_sv(self, test_scp, trials, score_file, + multisession_avg=True, indomain_scp=None): """ Caculate the plda score @@ -204,16 +214,32 @@ def eval_sv(self, enrollspks = {} testspks = {} + enrollcounts = {} for key, value in enroll_embeddings_dict.items(): + if multisession_avg: + enrollcounts[key] = 1 + else: + enrollcounts[key] = len(value) value = np.vstack(value) value = value - mean_vec # Shuai - tmp = norm_embeddings(np.mean(value, 0)) + + # Normalize length + # It is questionable whether this should be applied + # after speaker mean in case of multisession scoring. 
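+            # (When multisession_avg is False, the by-the-book score in
+            # log_likelihood_ratio() additionally uses the number of
+            # enrollment sessions n = enrollcounts[key] set above, i.e.
+            # mean = n * psi / (n * psi + 1) * enroll and
+            # variance = 1 + psi / (n * psi + 1).)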
+ if self.normalize_length: + tmp = norm_embeddings(np.mean(value, 0)) + + else: + tmp = np.mean(value, 0) tmp = self.transform_embedding(tmp) enrollspks[key] = tmp for key, value in test_embeddings_dict.items(): value = value - mean_vec # Shuai - tmp = norm_embeddings(value) + if self.normalize_length: + tmp = norm_embeddings(value) + else: + tmp = value tmp = self.transform_embedding(tmp) testspks[key] = tmp @@ -222,7 +248,8 @@ def eval_sv(self, for line in tqdm(read_trials): tokens = line.strip().split() score = self.log_likelihood_ratio(enrollspks[tokens[0]], - testspks[tokens[1]]) + testspks[tokens[1]], + enrollcounts[tokens[0]]) segs = line.strip().split() output_line = ('{} {} {:.5f} {}\n'.format( segs[0], segs[1], score, segs[2])) @@ -234,7 +261,8 @@ def adapt(self, adapt_scp, ac_scale=0.5, wc_scale=0.5): adp_data = np.array(list(read_vec_scp_file(adapt_scp).values())) mean_vec = adp_data.mean(0) adp_data = adp_data - mean_vec - adp_data = norm_embeddings(adp_data) + if self.normalize_length: + adp_data = norm_embeddings(adp_data) plda_mean, plda_trans, plda_psi = self.mu, self.transform, self.psi W = inv(plda_trans.T.dot(plda_trans)) @@ -303,6 +331,12 @@ def save_model(self, output_file_name): maxshape=(None), compression="gzip", fletcher32=True) + f.create_dataset("normalize_length", + data=int(self.normalize_length), + maxshape=(None)) + f.create_dataset("subtract_train_set_mean", + data=int(self.subtract_train_set_mean), + maxshape=(None)) @staticmethod def load_model(model_name, from_kaldi=False): @@ -317,4 +351,13 @@ def load_model(model_name, from_kaldi=False): plda.transform = f.get("transform")[()] plda.psi = f.get("psi")[()] plda.offset = f.get("offset")[()] + plda.normalize_length = bool(f.get("normalize_length")[()]) + plda.subtract_train_set_mean = bool( + f.get("subtract_train_set_mean")[()]) + print("PLDA normalize length is {}.".format( + plda.normalize_length)) + print("PLDA subtract_train_set_mean is {}.".format( + plda.subtract_train_set_mean)) + + plda.dim = plda.mu.shape[0] return plda
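
For reference, a minimal sketch of how the new embedding-processing scripts fit together. The file names used here (train_xvector.scp, train_utt2spk, indomain_xvector.scp, embd_proc*.pkl) are placeholders rather than files produced by the recipe, and the scp files must exist before the chain can be estimated:
```
import numpy as np
from wespeaker.utils.embedding_processing import EmbeddingProcessingChain

# Estimate a chain on out-of-domain training embeddings (what
# wespeaker/bin/prep_embd_proc.py does). Constructing the chain already
# estimates the parameters of every step.
chain = EmbeddingProcessingChain(
    chain="mean-subtract --scp train_xvector.scp | length-norm "
          "| lda --scp train_xvector.scp --utt2spk train_utt2spk --dim 100 "
          "| length-norm")
chain.save("embd_proc.pkl")

# Replace link 0 with a mean estimated on in-domain data (what
# wespeaker/bin/update_embd_proc.py does).
chain.update_link(0, "mean-subtract --scp indomain_xvector.scp")
chain.save("embd_proc_indomain.pkl")

# Apply a saved chain to a matrix of embeddings, one row per utterance (what
# wespeaker/bin/apply_embd_proc.py does after reading them with kaldiio).
chain = EmbeddingProcessingChain()
chain.load("embd_proc_indomain.pkl")
embd = np.random.randn(10, 256)  # stand-in for embeddings of matching dimension
embd_proc = chain(embd)
```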