add device option in wespeaker cli; set onnxruntime log level (#351)
* add a device option to the wespeaker CLI, for example --device mps for Metal (macOS) and --device cuda for CUDA (Windows and Linux); set the onnxruntime log level to 1 in extract_emb.py

* update related docs
xx205 authored Aug 23, 2024
1 parent 5ac089e commit 91aceec
Showing 5 changed files with 16 additions and 17 deletions.
10 changes: 6 additions & 4 deletions docs/python_package.md
@@ -21,6 +21,8 @@ $ wespeaker --task embedding --audio_file audio.wav --output_file embedding.txt
 $ wespeaker --task embedding_kaldi --wav_scp wav.scp --output_file /path/to/embedding
 $ wespeaker --task similarity --audio_file audio.wav --audio_file2 audio2.wav
 $ wespeaker --task diarization --audio_file audio.wav
+$ wespeaker --task diarization --audio_file audio.wav --device cuda:0 # use CUDA on Windows/Linux
+$ wespeaker --task diarization --audio_file audio.wav --device mps # use Metal Performance Shaders on MacOS
 ```

 You can specify the following parameters. (use `-h` for details)
@@ -33,7 +35,7 @@ You can specify the following parameters. (use `-h` for details)
   - diarization_list: apply speaker diarization for a kaldi-style wav.scp
 * `-l` or `--language`: use Chinese/English speaker models
 * `-p` or `--pretrain`: the path of pretrained model, `avg_model.pt` and `config.yaml` should be contained
-* `-g` or `--gpu`: use GPU for inference, number $< 0$ means using CPU
+* `--device`: set pytorch device, `cpu`, `cuda`, `cuda:0` or `mps`
 * `--campplus`:
   use [`campplus_cn_common_200k` of damo](https://www.modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary)
 * `--eres2net`:
@@ -69,14 +71,14 @@ which can either be the ones we provided and trained by yourself.
 import wespeaker

 model = wespeaker.load_model('chinese')
-# set_gpu to enable the cuda inference, number < 0 means using CPU
-model.set_gpu(0)
+# set the device on which tensors are or will be allocated.
+model.set_device('cuda:0')

 # embedding/embedding_kaldi/similarity/diarization
 embedding = model.extract_embedding('audio.wav')
 utt_names, embeddings = model.extract_embedding_list('wav.scp')
 similarity = model.compute_similarity('audio1.wav', 'audio2.wav')
-diar_result = model.diarize('audio.wav')
+diar_result = model.diarize('audio.wav', 'give_this_utt_a_name')

 # register and recognize
 model.register('spk1', 'spk1_audio1.wav')
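The CLI examples and Python API above replace the old integer GPU selection with an explicit device string. Below is a minimal sketch of how a caller might pick a device before handing it to `set_device` (or to `--device` on the command line); the availability checks are standard PyTorch calls, and the preference order is an illustrative assumption, not part of this commit.

```python
import torch
import wespeaker

def pick_device() -> str:
    # Prefer CUDA (Windows/Linux), then Apple Metal (MPS), then plain CPU.
    if torch.cuda.is_available():
        return 'cuda:0'
    if torch.backends.mps.is_available():
        return 'mps'
    return 'cpu'

model = wespeaker.load_model('chinese')
model.set_device(pick_device())
embedding = model.extract_embedding('audio.wav')
```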
8 changes: 2 additions & 6 deletions wespeaker/cli/speaker.py
@@ -72,11 +72,7 @@ def set_resample_rate(self, resample_rate: int):
     def set_vad(self, apply_vad: bool):
         self.apply_vad = apply_vad

-    def set_gpu(self, device_id: int):
-        if device_id >= 0:
-            device = 'cuda:{}'.format(device_id)
-        else:
-            device = 'cpu'
+    def set_device(self, device: str):
         self.device = torch.device(device)
         self.model = self.model.to(self.device)

@@ -304,7 +300,7 @@ def main():
         model = load_model_local(args.pretrain)
     model.set_resample_rate(args.resample_rate)
     model.set_vad(args.vad)
-    model.set_gpu(args.gpu)
+    model.set_device(args.device)
     model.set_diarization_params(min_duration=args.diar_min_duration,
                                  window_secs=args.diar_window_secs,
                                  period_secs=args.diar_period_secs,
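For callers that were using the removed `set_gpu(device_id)`, the old integer convention maps onto the new device strings as in the deleted branch above. A hedged migration sketch follows; the `gpu_id_to_device` helper is hypothetical and not part of the commit.

```python
import wespeaker

def gpu_id_to_device(device_id: int) -> str:
    # Mirrors the removed set_gpu() logic: a negative id meant CPU,
    # a non-negative id selected that CUDA device.
    return 'cuda:{}'.format(device_id) if device_id >= 0 else 'cpu'

model = wespeaker.load_model('chinese')
model.set_device(gpu_id_to_device(0))   # formerly model.set_gpu(0)
model.set_device(gpu_id_to_device(-1))  # formerly CPU inference
```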
11 changes: 6 additions & 5 deletions wespeaker/cli/utils.py
@@ -52,11 +52,12 @@ def get_args():
                         type=str,
                         default="",
                         help='model directory')
-    parser.add_argument('-g',
-                        '--gpu',
-                        type=int,
-                        default=-1,
-                        help='which gpu to use (number <0 means using cpu)')
+    parser.add_argument('--device',
+                        type=str,
+                        default='cpu',
+                        help="device type (most commonly cpu or cuda,"
+                        "but also potentially mps, xpu, xla or meta)"
+                        "and optional device ordinal for the device type.")
     parser.add_argument('--audio_file', help='audio file')
     parser.add_argument('--audio_file2',
                         help='audio file2, specifically for similarity task')
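Since `--device` now accepts a free-form string, a typo only surfaces later when `torch.device()` is constructed inside `set_device`. A sketch of how the string could be validated at parse time instead; the `parse_device` helper is hypothetical and not part of this change, and it relies on `torch.device` raising `RuntimeError` for unknown device types.

```python
import argparse
import torch

def parse_device(value: str) -> str:
    # Hypothetical validator: torch.device() raises RuntimeError on an
    # unrecognized device string, so surface that as an argparse error.
    try:
        torch.device(value)
    except RuntimeError as e:
        raise argparse.ArgumentTypeError(str(e))
    return value

parser = argparse.ArgumentParser()
parser.add_argument('--device', type=parse_device, default='cpu')
args = parser.parse_args(['--device', 'cuda:0'])
print(args.device)  # 'cuda:0'
```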
2 changes: 1 addition & 1 deletion wespeaker/diar/extract_emb.py
@@ -37,7 +37,7 @@ def init_session(source, device):
     opts = ort.SessionOptions()
     opts.inter_op_num_threads = 1
     opts.intra_op_num_threads = 1
-    opts.log_severity_level = 0
+    opts.log_severity_level = 1
     session = ort.InferenceSession(source,
                                    sess_options=opts,
                                    providers=providers)
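ONNX Runtime's `log_severity_level` uses 0=VERBOSE, 1=INFO, 2=WARNING (the default), 3=ERROR, 4=FATAL, so this change quiets the diarization session from verbose to info-level logging. A minimal sketch of a session configured the same way; the provider list here is an assumption, since this hunk does not show how `providers` is built.

```python
import onnxruntime as ort

def make_session(model_path: str, providers=None):
    # Sketch mirroring init_session(): single-threaded ops and
    # info-level (1) logging instead of verbose (0).
    opts = ort.SessionOptions()
    opts.inter_op_num_threads = 1
    opts.intra_op_num_threads = 1
    opts.log_severity_level = 1
    providers = providers or ['CPUExecutionProvider']
    return ort.InferenceSession(model_path, sess_options=opts, providers=providers)
```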
2 changes: 1 addition & 1 deletion wespeaker/frontend/s3prl.py
@@ -57,7 +57,7 @@ def __init__(self,

         if layer != -1:
             layer_selections = [layer]
-            assert not multilayer_feature,\
+            assert not multilayer_feature, \
                 "multilayer_feature must be False if layer is specified"
         else:
             layer_selections = None
