wenet-e2e · JiJiJiang · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/wespeaker/models/campplus.py b/wespeaker/models/campplus.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2023 Hongji Wang ([email protected])
+#               2024 Zhengyang Chen ([email protected])
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -394,6 +395,17 @@ def __init__(self,
                 if m.bias is not None:
                     nn.init.zeros_(m.bias)
 
+    def get_frame_level_feat(self, x):
+        # Public API: runs head and all but the last two xvector layers
+        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
+        x = self.head(x)
+        for layer in self.xvector[:-2]:
+            x = layer(x)
+
+        out = x.permute(0, 2, 1)  # (B,D,T) => (B,T,D)
+
+        return out  # (B, T, D)
+
     def forward(self, x):
         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
         x = self.head(x)

diff --git a/wespeaker/models/ecapa_tdnn.py b/wespeaker/models/ecapa_tdnn.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2021 Zhengyang Chen ([email protected])
 #               2022 Hongji Wang ([email protected])
 #               2023 Bing Han ([email protected])
+#               2024 Zhengyang Chen ([email protected])
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -204,7 +205,8 @@ def __init__(self,
         else:
             self.bn2 = nn.Identity()
 
-    def forward(self, x):
+    def __get_frame_level_feat(self, x):
+        # for inner class usage
         x = x.permute(0, 2, 1)  # (B,T,F) -> (B,F,T)
 
         out1 = self.layer1(x)
@@ -213,7 +215,17 @@ def forward(self, x):
         out4 = self.layer4(out3)
 
         out = torch.cat([out2, out3, out4], dim=1)
-        out = F.relu(self.conv(out))
+        out = self.conv(out)
+
+        return out
+
+    def get_frame_level_feat(self, x):
+        # Public API: conv output before ReLU, permuted so time is dim 1
+        out = self.__get_frame_level_feat(x).permute(0, 2, 1)
+        return out  # (B, T, D)
+
+    def forward(self, x):
+        out = F.relu(self.__get_frame_level_feat(x))
         out = self.bn(self.pool(out))
         out = self.linear(out)
         if self.emb_bn:

diff --git a/wespeaker/models/eres2net.py b/wespeaker/models/eres2net.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2024 Hongji Wang ([email protected])
+#               2024 Zhengyang Chen ([email protected])
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -350,7 +351,8 @@ def _make_layer(self,
             self.in_planes = planes * self.expansion
         return nn.Sequential(*layers)
 
-    def forward(self, x):
+    def __get_frame_level_feat(self, x):
+        # for inner class usage
         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
         x = x.unsqueeze_(1)
         out = F.relu(self.bn1(self.conv1(x)))
@@ -364,6 +366,19 @@ def forward(self, x):
         out4 = self.layer4(out3)
         fuse_out123_downsample = self.layer3_downsample(fuse_out123)
         fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
+
+        return fuse_out1234
+
+    def get_frame_level_feat(self, x):
+        # Public API: fused feature map flattened to one vector per frame
+        out = self.__get_frame_level_feat(x)
+        out = out.transpose(1, 3)  # dim 3 (time) moves to dim 1
+        out = torch.flatten(out, 2, -1)  # merge trailing dims into D
+
+        return out  # (B, T, D)
+
+    def forward(self, x):
+        fuse_out1234 = self.__get_frame_level_feat(x)
         stats = self.pool(fuse_out1234)
 
         embed_a = self.seg_1(stats)

diff --git a/wespeaker/models/gemini_dfresnet.py b/wespeaker/models/gemini_dfresnet.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2024 Shuai Wang ([email protected])
 #               2024 Tianchi Liu ([email protected])
+#               2024 Zhengyang Chen ([email protected])
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -101,7 +102,8 @@ def __init__(self,
             self.seg_bn_1 = nn.Identity()
             self.seg_2 = nn.Identity()
 
-    def forward(self, x):
+    def __get_frame_level_feat(self, x):
+        # for inner class usage
         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
         x = x.unsqueeze_(1)
         out = self.downsample_layers[0](x)
@@ -114,6 +116,19 @@ def forward(self, x):
         out = self.downsample_layers[4](out)
         out = self.stages[3](out)
 
+        return out
+
+    def get_frame_level_feat(self, x):
+        # Public API: stages[3] output flattened to one vector per frame
+        out = self.__get_frame_level_feat(x)
+        out = out.transpose(1, 3)  # dim 3 (time) moves to dim 1
+        out = torch.flatten(out, 2, -1)  # merge trailing dims into D
+
+        return out  # (B, T, D)
+
+    def forward(self, x):
+
+        out = self.__get_frame_level_feat(x)
         stats = self.pool(out)
 
         embed_a = self.seg_1(stats)