From 0f7f3b4c35f1be670e8998f5168a612e915e523c Mon Sep 17 00:00:00 2001
From: Siwei Yang
Date: Thu, 21 Nov 2024 18:30:44 -0800
Subject: [PATCH 1/3] add VoxCeleb2Scenario for gender classification

---
 .../presentation/run_entries_speech.conf       |  1 +
 .../benchmark/run_specs/audio_run_specs.py     | 21 +++++
 .../audio_language/voxceleb2_scenario.py       | 87 +++++++++++++++++++
 src/helm/benchmark/static/schema_speech.yaml   | 22 +++++
 4 files changed, 131 insertions(+)
 create mode 100644 src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py

diff --git a/src/helm/benchmark/presentation/run_entries_speech.conf b/src/helm/benchmark/presentation/run_entries_speech.conf
index 68276b629a..e89a61208d 100644
--- a/src/helm/benchmark/presentation/run_entries_speech.conf
+++ b/src/helm/benchmark/presentation/run_entries_speech.conf
@@ -5,6 +5,7 @@ entries: [
   {description: "meld_audio:model=audiolm", priority: 1}
   {description: "vocal_sound:model=audiolm", priority: 1}
   {description: "audiocaps:model=audiolm", priority: 1}
+  {description: "voxceleb2:model=audiolm", priority: 1}
 
   ####################################################################################################################
   # Fairness
diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index ccccb1a08d..5b36c1569a 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -266,6 +266,27 @@ def get_audiocaps_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("voxceleb2")
+def get_voxceleb2_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Determine the gender of the following audio's speaker. Reply only with 'Male' or 'Female'. "
" + "Don't include any explanation", + max_tokens=50, + ) + metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + run_spec_name: str = "voxceleb2" + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) + + @run_spec_function("common_voice_15") def get_common_voice_15_run_spec(language: str) -> RunSpec: scenario_spec = ScenarioSpec( diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py new file mode 100644 index 0000000000..f85fb0ccb7 --- /dev/null +++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py @@ -0,0 +1,87 @@ +from typing import List +import os + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from tqdm import tqdm +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.general import ensure_file_downloaded +import pandas as pd +from glob import glob +from pydub import AudioSegment +from multiprocessing import Pool + + +def _m4a_to_wav(input_path, output_path): + audio = AudioSegment.from_file(input_path, format="m4a") + audio.export(output_path, format="wav") + + +def _process_single_sample(audio_path_gender_pair): + audio_path, gender = audio_path_gender_pair + audio_path_wav = audio_path[:-3] + "wav" + _m4a_to_wav(audio_path, audio_path_wav) + input = Input( + multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path_wav)]) + ) + references = [Reference(Output(text=gender), tags=[CORRECT_TAG])] + return Instance(input=input, references=references, split=TEST_SPLIT) + + +class VoxCeleb2Scenario(Scenario): + """VoxCeleb2 + + VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from + interview videos uploaded to YouTube. + + Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf + + Citation: + @inproceedings{Chung18b, + author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.", + title = "VoxCeleb2: Deep Speaker Recognition", + booktitle = "INTERSPEECH", + year = "2018", + } + """ + + DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip" + REFERENCE_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_meta.csv" + + name = "voxceleb2" + description = "A large-scale dataset of about 46K audio clips to human-written text pairs \ + ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))." 
From 4ab0c85d136482e47df0134f28e4df393ac46ace Mon Sep 17 00:00:00 2001
From: Siwei Yang
Date: Thu, 21 Nov 2024 19:55:29 -0800
Subject: [PATCH 2/3] change VoxCeleb2Scenario to an audio identification task

---
 .../benchmark/run_specs/audio_run_specs.py     |  4 +-
 .../audio_language/voxceleb2_scenario.py       | 63 ++++++++++---------
 src/helm/benchmark/static/schema_speech.yaml   |  2 +-
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index 5b36c1569a..66a424a0e7 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -272,8 +272,8 @@ def get_voxceleb2_run_spec() -> RunSpec:
         class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
     )
     adapter_spec = _get_generation_adapter_spec(
-        instructions="Determine the gender of the following audio's speaker. Reply only with 'Male' or 'Female'. "
-        "Don't include any explanation",
+        instructions="Determine whether the speakers in the following two audio clips are the same person. "
+        "Reply only with 'True' or 'False'. Don't include any explanation",
         max_tokens=50,
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
index f85fb0ccb7..6710b277ff 100644
--- a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
+++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
@@ -1,5 +1,6 @@
 from typing import List
 import os
+import os.path as osp
 
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -24,26 +25,22 @@ def _m4a_to_wav(input_path, output_path):
     audio.export(output_path, format="wav")
 
 
-def _process_single_sample(audio_path_gender_pair):
-    audio_path, gender = audio_path_gender_pair
+def _preprocess_single_sample(audio_path):
+    assert osp.exists(audio_path), f"Audio file does not exist at path: {audio_path}"
     audio_path_wav = audio_path[:-3] + "wav"
-    _m4a_to_wav(audio_path, audio_path_wav)
-    input = Input(
-        multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path_wav)])
-    )
-    references = [Reference(Output(text=gender), tags=[CORRECT_TAG])]
-    return Instance(input=input, references=references, split=TEST_SPLIT)
+    audio = AudioSegment.from_file(audio_path, format="m4a")
+    audio.export(audio_path_wav, format="wav")
 
 
 class VoxCeleb2Scenario(Scenario):
     """VoxCeleb2
 
-    VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from 
+    VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
     interview videos uploaded to YouTube.
 
     Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf
 
-    Citation: 
+    Citation:
     @inproceedings{Chung18b,
         author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
         title = "VoxCeleb2: Deep Speaker Recognition",
         booktitle = "INTERSPEECH",
         year = "2018",
     }
     """
 
@@ -53,35 +50,45 @@ class VoxCeleb2Scenario(Scenario):
     DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
-    REFERENCE_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_meta.csv"
+    REFERENCE_URL = (
+        "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
+    )
 
     name = "voxceleb2"
     description = "A large-scale dataset of about 46K audio clips to human-written text pairs \
         ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))."
-    tags: List[str] = ["audio", "classification"]
+    tags: List[str] = ["audio", "identification"]
 
     def get_instances(self, output_path: str) -> List[Instance]:
         instances: List[Instance] = []
-        data_root = os.path.join(output_path, "data/test/aac")
+        data_root = osp.join(output_path, "data/test/aac")
         ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=data_root, unpack=True)
-        df = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=" ,")
-        df = df[df["Set"] == "test"]
-        df = df[df["VoxCeleb2 ID"].apply(lambda x: x not in ["id04170", "id05348"])]
+        df = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
 
-        audio_path_gender_pairs = []
-
-        for _, row in tqdm(df.iterrows(), total=len(df)):
-            vox_celeb2_id = row["VoxCeleb2 ID"]
-            gender = "Male" if row["Gender"] == "m" else "Female"
-            audio_dir = os.path.join(data_root, vox_celeb2_id)
-            assert os.path.exists(audio_dir), f"Audio file does not exist at path: {audio_dir}"
+        df["first"] = df["first"].apply(lambda x: osp.join(data_root, x))
+        df["second"] = df["second"].apply(lambda x: osp.join(data_root, x))
 
-            audio_paths = glob(os.path.join(audio_dir, "**/*.m4a"), recursive=True)
-            audio_paths = sorted(audio_paths)
+        all_paths = set(df["first"].to_list() + df["second"].to_list())
+        with Pool(processes=4) as pool:
+            list(tqdm(pool.imap(_preprocess_single_sample, all_paths), total=len(all_paths)))
 
-            audio_path_gender_pairs += [(audio_path, gender) for audio_path in audio_paths]
+        instances = []
 
-        with Pool(processes=4) as pool:
-            instances = pool.map(_process_single_sample, audio_path_gender_pairs)
+        for _, row in tqdm(df.iterrows(), total=len(df)):
+            first = row["first"][:-3] + "wav"
+            second = row["second"][:-3] + "wav"
+            same = "True" if row["same"] else "False"
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/wav", location=first),
+                        MediaObject(content_type="audio/wav", location=second),
+                    ]
+                )
+            )
+
+            references = [Reference(Output(text=same), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
 
         return instances
diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml
index 226fe196b9..d1617ea524 100644
--- a/src/helm/benchmark/static/schema_speech.yaml
+++ b/src/helm/benchmark/static/schema_speech.yaml
@@ -336,7 +336,7 @@ run_groups:
       main_name: cider
       main_split: test
     taxonomy:
-      task: audio gender classification
+      task: audio identification
       what: audio clips in the wild
       who: real speakers
       when: "2018"
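[Note on PATCH 2/3] The preprocessing pool above uses `pool.imap` rather than `pool.map`: `imap` yields results lazily as workers finish, so the surrounding `tqdm` can show live progress, whereas `map` would block until the whole batch completed. A self-contained sketch of the pattern, with a stand-in worker for `_preprocess_single_sample`:

    import time
    from multiprocessing import Pool

    from tqdm import tqdm

    def slow_convert(path: str) -> str:
        time.sleep(0.01)  # stand-in for the pydub m4a -> wav conversion
        return path[:-3] + "wav"

    if __name__ == "__main__":
        paths = [f"clip_{i:03d}.m4a" for i in range(200)]
        with Pool(processes=4) as pool:
            # list(...) drains the lazy iterator so every file is converted before
            # the pool is torn down; tqdm advances once per finished task.
            converted = list(tqdm(pool.imap(slow_convert, paths), total=len(paths)))
        print(converted[0])  # clip_000.wav
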
From dcd10c583ef3b59e47070e6c757a742d14f4f038 Mon Sep 17 00:00:00 2001
From: ImKeTT
Date: Fri, 22 Nov 2024 15:13:18 -0800
Subject: [PATCH 3/3] fix VoxCeleb2 adaptation, schema metadata, and ffmpeg
 audio handling

---
 .../benchmark/run_specs/audio_run_specs.py     |  6 +-
 .../casual_conversations2_scenario.py          | 10 +-
 .../audio_language/voxceleb2_scenario.py       | 91 +++++++++++--------
 src/helm/benchmark/static/schema_speech.yaml   | 13 ++-
 src/helm/common/audio_utils.py                 | 20 ++++
 5 files changed, 81 insertions(+), 59 deletions(-)

diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index 66a424a0e7..97fcddd55f 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -271,10 +271,8 @@ def get_voxceleb2_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
     )
-    adapter_spec = _get_generation_adapter_spec(
-        instructions="Determine whether the speakers in the following two audio clips are the same person. "
-        "Reply only with 'True' or 'False'. Don't include any explanation",
-        max_tokens=50,
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
     run_spec_name: str = "voxceleb2"
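[Note on PATCH 3/3] Switching from `_get_generation_adapter_spec` to `_get_multiple_choice_joint_adapter_spec` changes how the task is posed: instead of free-form "True"/"False" generation, HELM renders the scenario's references as lettered options and asks for a single letter, and `max_train_instances=0` keeps the evaluation zero-shot. With the scenario changes below, each request is rendered roughly like this (illustrative sketch, not verbatim adapter output):

    <audio clip 1>
    <audio clip 2>
    Listen to the audio and take your best guess to determine if the two speakers are the same person.
    A. Yes
    B. No
    Answer:
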
diff --git a/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py b/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
index 93c9cd812c..783c9c9fa9 100644
--- a/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
+++ b/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
@@ -15,6 +15,7 @@
 )
 from helm.common.media_object import MediaObject, MultimediaObject
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.audio_utils import use_ffmpeg_to_extract_audio_from_video
 
 
 class CasualConversations2Scenario(Scenario):
@@ -72,12 +73,6 @@ def __init__(self, subject: str) -> None:
         self.options = self.age_options if subject == "age" else self.gender_options
         self.instruction = self.AGE_INSTRUCTION if subject == "age" else self.GENDER_INSTRUCTION
 
-    def _extract_audio_from_video(self, input_video_path: str, output_audio_path: str) -> None:
-        try:
-            os.system(f"ffmpeg -i {input_video_path} -q:a 0 -map a {output_audio_path}")
-        except Exception:
-            raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
-
     def _convert_age_to_label(self, age: str) -> str:
         if age != "prefer not to say":
             age_int = int(age)
@@ -128,8 +123,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
             if file_name.endswith(".mp4"):
                 local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
                 local_video_path: str = os.path.join(data_dir, file_name)
-                if not os.path.exists(local_audio_path):
-                    self._extract_audio_from_video(local_video_path, local_audio_path)
+                use_ffmpeg_to_extract_audio_from_video(local_video_path, local_audio_path)
                 assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"
 
                 subject_answer = audio_scripts[file_name][self._subject]
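[Note on PATCH 3/3] The CasualConversations2 change also swaps `os.system` for `subprocess.run` via the new shared helper. Two practical wins, sketched below with hypothetical paths: an argument list bypasses the shell, so paths containing spaces or metacharacters arrive as single argv entries, and `check=True` turns a nonzero ffmpeg exit status into an exception instead of a silently ignored return code.

    import subprocess

    video, audio = "raw/my clip.mp4", "out/my clip.mp3"

    # Fragile: the shell splits the unquoted paths into several bogus arguments,
    # and the ffmpeg exit status is only available through the ignored return value.
    # os.system(f"ffmpeg -i {video} -q:a 0 -map a {audio}")

    # Robust: no shell involved; raises subprocess.CalledProcessError on failure
    # and FileNotFoundError if the ffmpeg binary is not installed.
    subprocess.run(["ffmpeg", "-i", video, "-q:a", "0", "-map", "a", audio], check=True)
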
format="m4a") - audio.export(audio_path_wav, format="wav") class VoxCeleb2Scenario(Scenario): """VoxCeleb2 - VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from - interview videos uploaded to YouTube. + VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from + interview videos uploaded to YouTube. This dataset contains over a million utterances from over + 6,000 speakers. Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf @@ -53,42 +39,67 @@ class VoxCeleb2Scenario(Scenario): REFERENCE_URL = ( "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv" ) + IDENTITY_INSTRUCTION = ( + "Listen to the audio and take your best guess to determine if the two speakers are the same person." + ) name = "voxceleb2" - description = "A large-scale dataset of about 46K audio clips to human-written text pairs \ - ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))." + description = ( + "A large-scale dataset of over a million utterances from over 6,000 speakers with their" + "gender, race, identity information" + "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))." + ) tags: List[str] = ["audio", "identification"] + options: List[str] = ["Yes", "No"] + + def _convert_answer_to_label(self, answer: bool) -> str: + if answer: + return "A" + else: + return "B" + + def _reformat_and_convert_audio_file( + self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str + ) -> str: + tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav") + ensure_directory_exists(os.path.dirname(tgt_audio_path)) + use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path) + return tgt_audio_path def get_instances(self, output_path: str) -> List[Instance]: instances: List[Instance] = [] - data_root = osp.join(output_path, "data/test/aac") - ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=data_root, unpack=True) - df = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",") - - df["first"] = df["first"].apply(lambda x: osp.join(data_root, x)) - df["second"] = df["second"].apply(lambda x: osp.join(data_root, x)) - - all_paths = set(df["first"].to_list() + df["second"].to_list()) - with Pool(processes=4) as pool: - list(tqdm(pool.imap(_preprocess_single_sample, all_paths), total=len(all_paths))) - + audio_data_path = os.path.join(output_path, "audio_files") + tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files") + ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True) + annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",") instances = [] + for _, row in tqdm(annotations.iterrows(), total=len(annotations)): + tgt_first_audio_path = self._reformat_and_convert_audio_file( + row["first"], tgt_audio_data_path, audio_data_path + ) + tgt_second_audio_path = self._reformat_and_convert_audio_file( + row["second"], tgt_audio_data_path, audio_data_path + ) - for _, row in tqdm(df.iterrows(), total=len(df)): - first = row["first"][:-3] + "wav" - second = row["second"][:-3] + "wav" - same = "True" if row["same"] else "False" + answer = self._convert_answer_to_label(row["same"]) + # The given correct answer is a letter, but we need an index + correct_answer_index: int = ord(answer) - ord("A") + references: List[Reference] = [] + for i, option in 
diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml
index d1617ea524..79c6f99163 100644
--- a/src/helm/benchmark/static/schema_speech.yaml
+++ b/src/helm/benchmark/static/schema_speech.yaml
@@ -323,24 +323,23 @@ run_groups:
   - name: voxceleb2
     display_name: VoxCeleb2
     description: >
-      AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
-      via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
-      musical instruments and genres, and common everyday environmental sounds.
-      ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf)).
       VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
-      interview videos uploaded to YouTube.([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
+      interview videos uploaded to YouTube. It contains over a million utterances from over 6,000
+      speakers of 145 different nationalities, covering a wide range of accents, ages, ethnicities
+      and languages, annotated with gender, race, and identity information.
+      ([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
     metric_groups:
     - accuracy
     - general_information
     environment:
-      main_name: cider
+      main_name: exact_match
       main_split: test
     taxonomy:
       task: audio identification
       what: audio clips in the wild
       who: real speakers
       when: "2018"
-      language: 145 languages
+      language: English, German, French
 
   - name: common_voice_15
     display_name: Common Voice 15
     description: >
diff --git a/src/helm/common/audio_utils.py b/src/helm/common/audio_utils.py
index 1690623df0..a355d8cb6c 100644
--- a/src/helm/common/audio_utils.py
+++ b/src/helm/common/audio_utils.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import soundfile as sf
+import subprocess
 
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -42,3 +43,22 @@ def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarray:
     # librosa accepts a local file path or a file-like object
     audio_array, _ = librosa.load(audio_file, sr=sample_rate)
     return audio_array
+
+
+def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
+    """Use ffmpeg to convert an audio file to a different format."""
+    if os.path.exists(output_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
+
+
+def use_ffmpeg_to_extract_audio_from_video(input_video_path: str, output_audio_path: str) -> None:
+    if os.path.exists(output_audio_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
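
[Note on PATCH 3/3] A usage sketch for the two helpers added to `audio_utils.py` (paths are hypothetical). Both return early when the output file already exists, so scenarios can call them unconditionally on every run; note that `use_ffmpeg_to_convert_audio_file` does not create the destination directory itself, which is why the VoxCeleb2 scenario calls `ensure_directory_exists` first.

    from helm.common.audio_utils import (
        use_ffmpeg_to_convert_audio_file,
        use_ffmpeg_to_extract_audio_from_video,
    )
    from helm.common.general import ensure_directory_exists

    # Hypothetical paths; both calls are no-ops if the target file already exists.
    ensure_directory_exists("tgt/id00017")
    use_ffmpeg_to_convert_audio_file("data/id00017/clip.m4a", "tgt/id00017/clip.wav")
    use_ffmpeg_to_extract_audio_from_video("videos/session.mp4", "videos/session.mp3")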