From b5da60d9bfb757fb6d4402288ad3e5d7a1eabadb Mon Sep 17 00:00:00 2001
From: "Rohdin Johan A." <rohdin@fit.vutbr.cz>
Date: Mon, 26 Aug 2024 15:52:53 +0200
Subject: [PATCH] Some yapf fixes

---
 examples/sre/v3/README.md               |  2 +-
 tools/copy_data_dir.sh                  |  6 ++--
 wespeaker/bin/apply_embd_proc.py        | 27 +++++++++++------
 wespeaker/bin/prep_embd_proc.py         |  7 +++--
 wespeaker/utils/embedding_processing.py | 39 ++++++++++++-------------
 5 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/examples/sre/v3/README.md b/examples/sre/v3/README.md
index 32fc0add..82f1ca35 100644
--- a/examples/sre/v3/README.md
+++ b/examples/sre/v3/README.md
@@ -7,7 +7,7 @@
 Similarly to ../v2, this recipe uses silero vad https://github.com/snakers4/silero-vad
 downloaded from here https://github.com/snakers4/silero-vad/archive/refs/tags/v4.0.zip
 If you intended to use this recipe for an evaluation/competition, make sure to check that
-it is allowed to use the data used to train Silero.
+it is allowed to use the data that has been used to train Silero.
 
 ### Instructions
 * Set the paths in stage 1. The variable ```sre_data_dir``` is assumed to be prepared by
diff --git a/tools/copy_data_dir.sh b/tools/copy_data_dir.sh
index 9f781242..c4cd4db6 100755
--- a/tools/copy_data_dir.sh
+++ b/tools/copy_data_dir.sh
@@ -51,7 +51,7 @@ else
         awk 'NR==FNR{a[$1];next}$1 in a{print $0}' $utt_list $src_dir/utt2spk > $dest_dir/utt2spk
     elif [ ! -z "$spk_list" ];then
         #echo "A"
-        awk 'NR==FNR{a[$1];next}$2 in a{print $0}' $spk_list $src_dir/utt2spk > $dest_dir/utt2spk 
+        awk 'NR==FNR{a[$1];next}$2 in a{print $0}' $spk_list $src_dir/utt2spk > $dest_dir/utt2spk
     else
         cp $src_dir/utt2spk $dest_dir/utt2spk
     fi
@@ -66,12 +66,12 @@ else
         cat $scrdir/spk2utt | tools/spk2utt_to_utt2spk.pl \
             | awk 'NR==FNR{a[$1];next}$1 in a{print $0}' $utt_list - \
             | tools/utt2spk_to_spk2utt.pl > $dest_dir/spk2utt
-        
+
     elif [ ! -z "$spk_list" ];then
         awk 'NR==FNR{a[$1];next}$1 in a{print $0}' $spk_list $src_dir/spk2utt > $dest_dir/spk2utt
     else
         cp $src_dir/spk2utt $dest_dir/spk2utt
-    fi        
+    fi
 fi
 
 
diff --git a/wespeaker/bin/apply_embd_proc.py b/wespeaker/bin/apply_embd_proc.py
index 37f7a5af..a7149ed4 100644
--- a/wespeaker/bin/apply_embd_proc.py
+++ b/wespeaker/bin/apply_embd_proc.py
@@ -22,11 +22,17 @@
     xxx
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument('--path', type=str, default='', 
+    parser.add_argument('--path',
+                        type=str,
+                        default='',
                         help='Path to processing chain.')
-    parser.add_argument('--input', type=str, default='', 
+    parser.add_argument('--input',
+                        type=str,
+                        default='',
                         help='Input scp file.')
-    parser.add_argument('--output', type=str, default='', 
+    parser.add_argument('--output',
+                        type=str,
+                        default='',
                         help='Output scp/ark file.')
     args = parser.parse_args()
 
@@ -34,20 +40,21 @@
     processingChain.load(args.path)
 
     embd = []
-    utt = [] 
+    utt = []
     for k, v in kaldiio.load_scp_sequential(args.input):
-        utt.append(k) 
+        utt.append(k)
         embd.append(v)
     embd = np.array(embd)
     utt = np.array(utt)
 
-    print("Read {} embeddings of dimension {}.".format(embd.shape[0], embd.shape[1]))
+    print("Read {} embeddings of dimension {}.".format(embd.shape[0],
+                                                       embd.shape[1]))
 
     embd = processingChain(embd)
 
     # Store both ark and scp if extention '.ark,scp' or '.scp,ark'. Or, only
     # ark if extension is '.ark'
-    output_file = args.output 
+    output_file = args.output
     if output_file.endswith('ark,scp') or output_file.endswith('scp,ark'):
         output_file = output_file.rstrip('ark,scp')
         output_file = output_file.rstrip('scp,ark')
@@ -63,6 +70,8 @@
                 e = embd[i]
                 writer(u, e)
     else:
-        raise Exception("Invalid file extension of output file {}".format(output_file))
+        raise Exception(
+            "Invalid file extension of output file {}".format(output_file))
 
-    print("Wrote {} embeddings of dimension {}.".format(embd.shape[0], embd.shape[1]))
+    print("Wrote {} embeddings of dimension {}.".format(
+        embd.shape[0], embd.shape[1]))
diff --git a/wespeaker/bin/prep_embd_proc.py b/wespeaker/bin/prep_embd_proc.py
index e592d30b..445eab08 100644
--- a/wespeaker/bin/prep_embd_proc.py
+++ b/wespeaker/bin/prep_embd_proc.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import argparse
 
 from wespeaker.utils.embedding_processing import EmbeddingProcessingChain
@@ -22,8 +21,10 @@
     xxx
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument('--chain', type=str, 
-                        default='whitening | length-norm ', help='')
+    parser.add_argument('--chain',
+                        type=str,
+                        default='whitening | length-norm ',
+                        help='')
     parser.add_argument('--path', type=str)
     args = parser.parse_args()
 
diff --git a/wespeaker/utils/embedding_processing.py b/wespeaker/utils/embedding_processing.py
index 178a4c17..7595ffac 100644
--- a/wespeaker/utils/embedding_processing.py
+++ b/wespeaker/utils/embedding_processing.py
@@ -21,13 +21,13 @@
 
 
 def chain_string_to_dict(chain_string=None):
-    # This function converts an input string into a list and dictionary 
-    # structure suitable for use by the embedding processing classes below. 
+    # This function converts an input string into a list and dictionary
+    # structure suitable for use by the embedding processing classes below.
     # For example,
-    #     "mean-subtract --scp mean1_xvector.scp | length-norm " | 
+    #     "mean-subtract --scp mean1_xvector.scp | length-norm "
     #     "| lda  --scp lda_xvector.scp --utt2spk utt2spk --dim $lda_dim "
     #     "| length-norm"
-    # (The above three lines is supposed to be one long string but style 
+    # (The above three lines are supposed to be one long string but style
     # rules prevents it from be written that way here.)
     # becomes
     # [
@@ -74,8 +74,8 @@ def compute_mean_and_lda_scatter_matrices(self,
                                               utt2spk_file,
                                               equal_speaker_weight=False,
                                               current_chain=None):
-        # equal_speaker_weight: If True, each speaker is considered equally 
-        # important in the calculation of the mean and scatter matrices. If 
+        # equal_speaker_weight: If True, each speaker is considered equally
+        # important in the calculation of the mean and scatter matrices. If
         # False, speakers are weighted by their number of utterances.
         if current_chain is None:
             current_chain = []
@@ -89,7 +89,7 @@ def compute_mean_and_lda_scatter_matrices(self,
         for s in speakers:
             embd_s = current_chain(np.vstack(embeddings_dict[s]))
             count_s = embd_s.shape[0]
-            # With bias=False we need at least 2 speakers, with bias=True we 
+            # With bias=False we need at least 2 embeddings, with bias=True we
             # need at least 1. But this would result in covariance matrix = 0
             # for all its elements. (This is not necessarily wrong).
             if count_s > 1:
@@ -144,12 +144,12 @@ def __init__(self, args, current_chain=None):
             scp_file, utt2spk_file, current_chain=current_chain)
 
         E, M = spl.eigh(WC)
-        # Floor the within-class covariance eigenvalues. We noticed that this 
+        # Floor the within-class covariance eigenvalues. We noticed that this
         # was done in Kaldi.
         E_floor = np.max(E) * eps
         E[E < E_floor] = E_floor
-        """ 
-        # The new within-class covariance.        
+        """
+        # The new within-class covariance.
         WC       = M.dot(np.diag(E).dot(M.T))
         D, lda   = spl.eigh( BC, WC )         # The output of eigh is sorted in
         self.lda = lda[:,-dim:]               # ascending order so we so we kee
@@ -157,10 +157,10 @@ def __init__(self, args, current_chain=None):
         """
         # Since we have already found the eigen decomposition of WC, we could
         # whiten it by T1 = 1 / sqrt(E), I = T1 WC T1'. So instead of solving
-        # spl.eigh( BC, WC ) we can apply T1 on BC and solve 
-        # spl.eigh( T1 BC T1', T1 WC T1' ) 
-        #  = spl.eigh( T1 BC T1', I ) 
-        #  = spl.eigh( T1 BC T1') 
+        # spl.eigh( BC, WC ) we can apply T1 on BC and solve
+        # spl.eigh( T1 BC T1', T1 WC T1' )
+        #  = spl.eigh( T1 BC T1', I )
+        #  = spl.eigh( T1 BC T1')
         # as follows. However, T1 then needs to be inlcluded when transforming
         # the data. In either case, the result is that after LDA transform, the
         # data will have white WC and diagonal BC
@@ -169,10 +169,9 @@ def __init__(self, args, current_chain=None):
         D, lda = spl.eigh(BC)
         self.lda = np.dot(T1.T, lda[:, -dim:])
 
-        print(
-            "  Input dimension: {}, output dimension: {},"
-            " sum of all eigenvalues {:.2f}, sum of kept eigenvalues {:.2f}"
-            .format(len(D), dim, np.sum(D), np.sum(D[-dim:])))
+        print("  Input dimension: {}, output dimension: {},"
+              " sum of all eigenvalues {:.2f}, sum of kept eigenvalues {:.2f}".
+              format(len(D), dim, np.sum(D), np.sum(D[-dim:])))
         print("  All eigenvalues: {}".format(D))
 
     def __call__(self, embd):
@@ -189,9 +188,9 @@ def __call__(self, embd):
         embd_proc /= np.sqrt((embd_proc**2).sum(
             axis=1)[:, np.newaxis])  # This would make the lengths equal to one
         """
-        Todo: For Kaldi compatibility we may want to add this as option as 
+        Todo: For Kaldi compatibility we may want to add this as option as
         well as Kaldi style normalization.
-        embd_proc   *= np.sqrt(embd_normed.shape[1])       
+        embd_proc   *= np.sqrt(embd_normed.shape[1])
         """
         return (embd_proc)