forked from josepatino/pyBK
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config_DIHARD.ini
130 lines (115 loc) · 5.29 KB
/
config_DIHARD.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# AUTHORS
# Jose PATINO, EURECOM, Sophia-Antipolis, France, 2019
# http://www.eurecom.fr/en/people/patino-jose
# Contact: patino[at]eurecom[dot]fr, josempatinovillar[at]gmail[dot]com
# SYSTEM CONFIGURATION
# For INI config formatting please refer to https://docs.python.org/3/library/configparser.html
[EXPERIMENT]
# Label for this experiment/configuration
name = DIHARD
[GENERAL]
# Format of the speech activity detection (SAD) files: LBL, MDTM or RTTM
# The current version does not provide automatic SAD
SADformat = LBL
# Set to 0 if you already have VAD files; set to 1 if you do not have them or want to run automatic VAD in any case
# Note that access to the audio files is necessary to perform VAD
performVAD = 0
# Set to 1 to extract features from the audio files ([PATH] audio); set to 0 to read precomputed HTK features ([PATH] features)
performFeatureExtraction = 1
[PATH]
# Paths to the required input and output locations
# Audio files; required if performFeatureExtraction = 1
audio = ./path_to_DIHARD_audio/
# Features in HTK format; required if performFeatureExtraction = 0
features = ./path_to_DIHARD_features/
# UEM files indicate, on a per-audio-file basis, which part of the audio is to be considered. The system expects a .uem file with the same name as its audio file
UEM = ./uem/
# SAD files indicate, on a per-audio-file basis, which parts of the audio are speech. The system expects a .lbl/.mdtm/.rttm file with the same name as its audio file
# NOTE(review): [EXTENSION] sets SAD = .lab rather than .lbl - confirm which extension the LBL format actually uses
SAD = ./path_to_DIHARD_sad/
# The diarization output folder will contain the concatenated diarization outputs in .mdtm/.rttm format
output = ./out/
[EXTENSION]
# File extensions (including the leading dot) for each type of file
# Audio reading relies on the librosa library
# For format options please refer to https://librosa.github.io/librosa/
audio = .flac
UEM = .uem
SAD = .lab
output = .rttm
[FEATURES]
### NOTE: for our DIHARD participation we used IIR-CQT Mel-frequency cepstral coefficients (ICMC) with a different configuration than that described here.
### MATLAB code to replicate them is available at http://audio.eurecom.fr/software/
# Features (Mel-frequency cepstral coefficients, MFCCs) are extracted using the librosa library
# For details and options please refer to https://librosa.github.io/librosa/
# Window length for feature extraction
# NOTE(review): the value appears to be expressed in seconds (0.025 = 25 ms), not in ms as previously stated - confirm against the consumer
framelength = 0.025
# Window shift for feature extraction
# NOTE(review): likewise appears to be expressed in seconds (0.01 = 10 ms) - confirm against the consumer
# Note that VAD is assumed to have the same resolution as feature extraction; if you need something more specific you can bypass this easily
frameshift = 0.01
# Number of mel filters used
nfilters = 20
# Number of MFCCs employed
ncoeff = 19
[KBM]
# This section configures the binary key background model (KBM), which is trained on the MFCCs of each respective audio file
# Minimum number of Gaussians in the initial pool
minimumNumberOfInitialGaussians = 1024
# Maximum window rate for Gaussian computation
maximumKBMWindowRate = 50
# Window length for computing Gaussians
# NOTE(review): units are not stated; presumably feature frames - confirm
windowLength = 200
# Number of final Gaussian components in the KBM (absolute size)
# NOTE(review): presumably only used when useRelativeKBMsize = 0 - confirm
kbmSize = 320
# If set to 1, the KBM size is set as a proportion, given by "relKBMsize", of the pool size
useRelativeKBMsize = 1
# Relative KBM size used if "useRelativeKBMsize = 1" (value between 0 and 1)
relKBMsize = 0.1
[SEGMENT]
# This section configures the windows of feature frames from which the binary keys/cumulative vectors are extracted
# Window size in frames
length = 100
# Window increment after and before the window, in frames
increment = 100
# Window shift in frames
rate = 100
[BINARY_KEY]
# This section configures the parameters used for the extraction of the binary keys/cumulative vectors
# Number of top selected components per frame
topGaussiansPerFrame = 5
# Proportion of bits set to 1 in the binary keys
# NOTE(review): described as a percentage, but the value looks like a fraction (0.2 = 20%) - confirm
bitsPerSegmentFactor = 0.2
[CLUSTERING]
# This section configures the clustering of the binary keys/cumulative vectors
# Number of initial clusters
N_init = 25
# Set to 1 to perform linkage clustering instead of clustering/reassignment
linkage = 0
# Linkage criterion used if linkage = 1 ('average', 'single', 'complete')
linkageCriterion = average
# Similarity metric: 'cosine' for cumulative vectors, and 'jaccard' for binary keys
metric = cosine
[CLUSTERING_SELECTION]
# This section configures how the output clustering solution is selected among those produced by the AHC algorithm
# Distance metric used in the selection of the output clustering solution ('jaccard', 'cosine')
metric_clusteringSelection = cosine
# Method employed to select the number of clusters. Can be either 'elbow' for an elbow criterion based on the within-class sum of squares (WCSS) or 'spectral' for spectral clustering
bestClusteringCriterion = spectral
# Spectral clustering parameters, employed if bestClusteringCriterion = spectral
sigma = 1
percentile = 40
# If known, the maximum number of speakers in a session in the database.
# This limits the effect of changes in very small, meaningless eigenvalues generating huge eigengaps
maxNrSpeakers = 11
[RESEGMENTATION]
# This section configures the Gaussian mixture model (GMM) based maximum-likelihood assignment resegmentation algorithm
# Set to 1 to perform resegmentation
resegmentation = 1
# Number of GMM components
modelSize = 128
# Number of expectation-maximization (EM) iterations
nbIter = 10
# Size of the likelihood smoothing window, in number of frames
smoothWin = 100
[OUTPUT]
# Output file format: 'MDTM' or 'RTTM'
format = RTTM
# If 0, all the segmentation outputs are stored in a single output file
# If 1, all partial clustering solutions obtained at every iteration are stored in separate files. Useful for debugging and system-tuning purposes
returnAllPartialSolutions = 0