Skip to content

Commit

Permalink
Add a function extract_embedding_from_pcm such that we can easily e…
Browse files Browse the repository at this point in the history
…xtract speaker embeddings from part of a long wav file, as well as from formats other than wav. (#361)
  • Loading branch information
wq2012 authored Sep 15, 2024
1 parent 7ef894a commit 3ccc791
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion wespeaker/cli/speaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,22 @@ def extract_embedding_feats(self, fbanks, batch_size, subseg_cmn):
def extract_embedding(self, audio_path: str):
    """Load an audio file and delegate to ``extract_embedding_from_pcm``.

    The file at ``audio_path`` is read with ``torchaudio.load``, passing
    ``self.wavform_norm`` as the ``normalize`` flag. The loaded samples and
    their sample rate are then handed to ``extract_embedding_from_pcm`` and
    its result is returned unchanged.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        Whatever ``self.extract_embedding_from_pcm`` returns for the
        loaded audio.
    """
    waveform, rate = torchaudio.load(
        audio_path,
        normalize=self.wavform_norm,
    )
    return self.extract_embedding_from_pcm(waveform, rate)

def extract_embedding_from_pcm(self, pcm: torch.Tensor, sample_rate: int):
if self.apply_vad:
# TODO(Binbin Zhang): Refine the segments logic, here we just
# suppose there is only silence at the start/end of the speech
wav = read_audio(audio_path)
vad_sample_rate = 16000
wav = pcm
if wav.size(0) > 1:
wav = wav.mean(dim=0, keepdim=True)

if sample_rate != vad_sample_rate:
transform = torchaudio.transforms.Resample(
orig_freq=sample_rate,
new_freq=vad_sample_rate)
wav = transform(wav)
segments = get_speech_timestamps(wav, self.vad, return_seconds=True)
pcmTotal = torch.Tensor()
if len(segments) > 0: # remove all the silence
Expand Down

0 comments on commit 3ccc791

Please sign in to comment.