forked from k2-fsa/icefall
-
Notifications
You must be signed in to change notification settings - Fork 0
/
distillation_with_hubert.sh
executable file
·229 lines (203 loc) · 9.04 KB
/
distillation_with_hubert.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env bash
#
# A short introduction about distillation framework.
#
# A typical traditional distillation method is
# Loss(teacher embedding, student embedding).
#
# Comparing to these, the proposed distillation framework contains two mainly steps:
# codebook indexes = quantizer.encode(teacher embedding)
# Loss(codebook indexes, student embedding)
#
# Things worth to meantion:
# 1. The float type teacher embedding is quantized into a sequence of
# 8-bit integer codebook indexes.
# 2. a middle layer 36(1-based) out of total 48 layers is used to extract
# teacher embeddings.
# 3. a middle layer 6(1-based) out of total 6 layers is used to extract
# student embeddings.
#
# To directly download the extracted codebook indexes for model distillation, you can
# set stage=2, stop_stage=4, use_extracted_codebook=True
#
# To start from scratch, you can
# set stage=0, stop_stage=4, use_extracted_codebook=False
stage=0
stop_stage=4
# Set the GPUs available.
# This script requires at least one GPU.
# You MUST set environment variable "CUDA_VISIBLE_DEVICES",
# even you only have ONE GPU. It needed by CodebookIndexExtractor to determine numbert of jobs to extract codebook indexes parallelly.
# Suppose only one GPU exists:
# export CUDA_VISIBLE_DEVICES="0"
#
# Suppose GPU 2,3,4,5 are available.
# export CUDA_VISIBLE_DEVICES="0,1,2,3"
exp_dir=./pruned_transducer_stateless6/exp
mkdir -p $exp_dir
# full_libri can be "True" or "False"
# "True" -> use full librispeech dataset for distillation
# "False" -> use train-clean-100 subset for distillation
full_libri=True
# use_extracted_codebook can be "True" or "False"
# "True" -> stage 0 and stage 1 would be skipped,
# and directly download the extracted codebook indexes for distillation
# "False" -> start from scratch
use_extracted_codebook=True
# teacher_model_id can be one of
# "hubert_xtralarge_ll60k_finetune_ls960" -> fine-tuned model, it is the one we currently use.
# "hubert_xtralarge_ll60k" -> pretrained model without fintuing
teacher_model_id=hubert_xtralarge_ll60k_finetune_ls960
. shared/parse_options.sh || exit 1
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ] && [ ! "$use_extracted_codebook" == "True" ]; then
log "Stage 0: Download HuBERT model"
# Preparation stage.
# Install fairseq according to:
# https://github.com/pytorch/fairseq
# when testing this code:
# commit 806855bf660ea748ed7ffb42fe8dcc881ca3aca0 is used.
has_fairseq=$(python3 -c "import importlib; print(importlib.util.find_spec('fairseq') is not None)")
if [ $has_fairseq == 'False' ]; then
log "Please install fairseq before running following stages"
exit 1
fi
# Install quantization toolkit:
# pip install git+https://github.com/k2-fsa/multi_quantization.git
# or
# pip install multi_quantization
has_quantization=$(python3 -c "import importlib; print(importlib.util.find_spec('multi_quantization') is not None)")
if [ $has_quantization == 'False' ]; then
log "Please install multi_quantization before running following stages"
exit 1
fi
log "Download HuBERT model."
# Parameters about model.
hubert_model_dir=${exp_dir}/hubert_models
hubert_model=${hubert_model_dir}/${teacher_model_id}.pt
mkdir -p ${hubert_model_dir}
# For more models refer to: https://github.com/pytorch/fairseq/tree/main/examples/hubert
if [ -f ${hubert_model} ]; then
log "HuBERT model alread exists."
else
wget -c https://dl.fbaipublicfiles.com/hubert/${teacher_model_id}.pt -P ${hubert_model_dir}
wget -c wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P ${hubert_model_dir}
fi
fi
if [ ! -d ./data/fbank ]; then
log "This script assumes ./data/fbank is already generated by prepare.sh"
exit 1
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] && [ ! "$use_extracted_codebook" == "True" ]; then
log "Stage 1: Verify that the downloaded HuBERT model is correct."
# This stage is not directly used by codebook indexes extraction.
# It is a method to "prove" that the downloaed hubert model
# is inferenced in an correct way if WERs look like normal.
# Expect WERs:
# [test-clean-ctc_greedy_search] %WER 2.04% [1075 / 52576, 92 ins, 104 del, 879 sub ]
# [test-other-ctc_greedy_search] %WER 3.71% [1942 / 52343, 152 ins, 126 del, 1664 sub ]
./pruned_transducer_stateless6/hubert_decode.py --exp-dir $exp_dir
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
# Analysis of disk usage:
# With num_codebooks==8, each teacher embedding is quantized into
# a sequence of eight 8-bit integers, i.e. only eight bytes are needed.
# Training dataset including clean-100h with speed perturb 0.9 and 1.1 has 300 hours.
# The output frame rates of Hubert is 50 per second.
# Theoretically, 412M = 300 * 3600 * 50 * 8 / 1024 / 1024 is needed.
# The actual size of all "*.h5" files storaging codebook index is 450M.
# I think the extra "48M" usage is some meta information.
# Time consumption analysis:
# For quantizer training data(teacher embedding) extraction, only 1000 utts from clean-100 are used.
# Together with quantizer training, no more than 20 minutes will be used.
#
# For codebook indexes extraction,
# with two pieces of NVIDIA A100 gpus, around three hours needed to process 300 hours training data,
# i.e. clean-100 with speed purteb 0.9 and 1.1.
# GPU usage:
# During quantizer's training data(teacher embedding) and it's training,
# only the first ONE GPU is used.
# During codebook indexes extraction, ALL GPUs set by CUDA_VISIBLE_DEVICES are used.
if [ "$use_extracted_codebook" == "True" ]; then
if [ ! "$teacher_model_id" == "hubert_xtralarge_ll60k_finetune_ls960" ]; then
log "Currently we only uploaded codebook indexes from teacher model hubert_xtralarge_ll60k_finetune_ls960"
exit 1
fi
# The codebook indexes to be downloaded are generated using the following setup:
embedding_layer=36
num_codebooks=8
mkdir -p $exp_dir/vq
codebook_dir=$exp_dir/vq/${teacher_model_id}
mkdir -p codebook_dir
codebook_download_dir=$exp_dir/download_codebook
if [ -d $codebook_download_dir ]; then
log "$codebook_download_dir exists, you should remove it first."
exit 1
fi
log "Downloading extracted codebook indexes to $codebook_download_dir"
# Make sure you have git-lfs installed (https://git-lfs.github.com)
# The codebook indexes are generated using lhotse 1.11.0, to avoid
# potential issues, we recommend you to use lhotse version >= 1.11.0
lhotse_version=$(python3 -c "import lhotse; from packaging import version; print(version.parse(lhotse.version.__version__)>=version.parse('1.11.0'))")
if [ "$lhotse_version" == "False" ]; then
log "Expecting lhotse >= 1.11.0. This may lead to potential ID mismatch."
fi
git lfs install
git clone https://huggingface.co/marcoyang/pruned_transducer_stateless6_hubert_xtralarge_ll60k_finetune_ls960 $codebook_download_dir
vq_fbank=data/vq_fbank_layer${embedding_layer}_cb${num_codebooks}/
mkdir -p $vq_fbank
mv $codebook_download_dir/*.jsonl.gz $vq_fbank
mkdir -p $codebook_dir/splits4
mv $codebook_download_dir/*.h5 $codebook_dir/splits4/
log "Remove $codebook_download_dir"
rm -rf $codebook_download_dir
fi
./pruned_transducer_stateless6/extract_codebook_index.py \
--full-libri $full_libri \
--exp-dir $exp_dir \
--embedding-layer $embedding_layer \
--num-utts 1000 \
--num-codebooks $num_codebooks \
--max-duration 100 \
--teacher-model-id $teacher_model_id \
--use-extracted-codebook $use_extracted_codebook
if [ "$full_libri" == "True" ]; then
# Merge the 3 subsets and create a full one
rm ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz
cat <(gunzip -c ${vq_fbank}/librispeech_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c ${vq_fbank}/librispeech_cuts_train-clean-360.jsonl.gz) \
<(gunzip -c ${vq_fbank}/librispeech_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
# Example training script.
# Note: it's better to set spec-aug-time-warpi-factor=-1
WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}')
./pruned_transducer_stateless6/train.py \
--manifest-dir ./data/vq_fbank \
--master-port 12359 \
--full-libri $full_libri \
--spec-aug-time-warp-factor -1 \
--max-duration 300 \
--world-size ${WORLD_SIZE} \
--num-epochs 20 \
--exp-dir $exp_dir \
--enable-distillation True
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
# Results should be similar to:
# errs-test-clean-beam_size_4-epoch-20-avg-10-beam-4.txt:%WER = 5.67
# errs-test-other-beam_size_4-epoch-20-avg-10-beam-4.txt:%WER = 15.60
./pruned_transducer_stateless6/decode.py \
--decoding-method "modified_beam_search" \
--epoch 20 \
--avg 10 \
--max-duration 200 \
--exp-dir $exp_dir \
--enable-distillation True
fi