diff --git a/src/helm/benchmark/presentation/run_entries_speech.conf b/src/helm/benchmark/presentation/run_entries_speech.conf
index 68276b629a..e89a61208d 100644
--- a/src/helm/benchmark/presentation/run_entries_speech.conf
+++ b/src/helm/benchmark/presentation/run_entries_speech.conf
@@ -5,6 +5,7 @@ entries: [
   {description: "meld_audio:model=audiolm", priority: 1}
   {description: "vocal_sound:model=audiolm", priority: 1}
   {description: "audiocaps:model=audiolm", priority: 1}
+  {description: "voxceleb2:model=audiolm", priority: 1}
 
   ####################################################################################################################
   # Fairness
diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index ccccb1a08d..97fcddd55f 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -266,6 +266,25 @@ def get_audiocaps_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("voxceleb2")
+def get_voxceleb2_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "voxceleb2"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("common_voice_15")
 def get_common_voice_15_run_spec(language: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
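Note: with the run entry and run spec above in place, the new scenario can be exercised end to end. A hedged invocation sketch (flag names as in current helm-run usage; the suite name and instance cap are placeholders):

    helm-run --run-entries voxceleb2:model=audiolm --suite my-suite --max-eval-instances 10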
{local_audio_path}" subject_answer = audio_scripts[file_name][self._subject] diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py new file mode 100644 index 0000000000..1a1218bfd5 --- /dev/null +++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py @@ -0,0 +1,105 @@ +from typing import List +import os + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from tqdm import tqdm +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.general import ensure_file_downloaded, ensure_directory_exists +from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file +import pandas as pd + + +class VoxCeleb2Scenario(Scenario): + """VoxCeleb2 + + VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from + interview videos uploaded to YouTube. This dataset contains over a million utterances from over + 6,000 speakers. + + Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf + + Citation: + @inproceedings{Chung18b, + author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.", + title = "VoxCeleb2: Deep Speaker Recognition", + booktitle = "INTERSPEECH", + year = "2018", + } + """ + + DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip" + REFERENCE_URL = ( + "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv" + ) + IDENTITY_INSTRUCTION = ( + "Listen to the audio and take your best guess to determine if the two speakers are the same person." + ) + + name = "voxceleb2" + description = ( + "A large-scale dataset of over a million utterances from over 6,000 speakers with their" + "gender, race, identity information" + "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))." 
diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
new file mode 100644
index 0000000000..1a1218bfd5
--- /dev/null
+++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
@@ -0,0 +1,104 @@
+from typing import List
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file
+import pandas as pd
+
+
+class VoxCeleb2Scenario(Scenario):
+    """VoxCeleb2
+
+    VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
+    interview videos uploaded to YouTube. This dataset contains over a million utterances from over
+    6,000 speakers.
+
+    Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf
+
+    Citation:
+    @inproceedings{Chung18b,
+        author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
+        title = "VoxCeleb2: Deep Speaker Recognition",
+        booktitle = "INTERSPEECH",
+        year = "2018",
+    }
+    """
+
+    DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
+    REFERENCE_URL = (
+        "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
+    )
+    IDENTITY_INSTRUCTION = (
+        "Listen to the audio and take your best guess to determine if the two speakers are the same person."
+    )
+
+    name = "voxceleb2"
+    description = (
+        "A large-scale dataset of over a million utterances from over 6,000 speakers with their "
+        "gender, race, and identity information "
+        "([Chung et al., 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))."
+    )
+    tags: List[str] = ["audio", "identification"]
+    options: List[str] = ["Yes", "No"]
+
+    def _convert_answer_to_label(self, answer: bool) -> str:
+        if answer:
+            return "A"
+        else:
+            return "B"
+
+    def _reformat_and_convert_audio_file(
+        self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str
+    ) -> str:
+        tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav")
+        ensure_directory_exists(os.path.dirname(tgt_audio_path))
+        use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path)
+        return tgt_audio_path
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        audio_data_path = os.path.join(output_path, "audio_files")
+        tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files")
+        ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True)
+        annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
+        for _, row in tqdm(annotations.iterrows(), total=len(annotations)):
+            tgt_first_audio_path = self._reformat_and_convert_audio_file(
+                row["first"], tgt_audio_data_path, audio_data_path
+            )
+            tgt_second_audio_path = self._reformat_and_convert_audio_file(
+                row["second"], tgt_audio_data_path, audio_data_path
+            )
+
+            answer = self._convert_answer_to_label(row["same"])
+            # The given correct answer is a letter, but we need an index
+            correct_answer_index: int = ord(answer) - ord("A")
+            references: List[Reference] = []
+            for i, option in enumerate(self.options):
+                reference: Reference
+                is_correct: bool = i == correct_answer_index
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                references.append(reference)
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/wav", location=tgt_first_audio_path),
+                        MediaObject(content_type="audio/wav", location=tgt_second_audio_path),
+                        MediaObject(content_type="text/plain", text=self.IDENTITY_INSTRUCTION),
+                    ]
+                )
+            )
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml
index 93a71e1bd2..79c6f99163 100644
--- a/src/helm/benchmark/static/schema_speech.yaml
+++ b/src/helm/benchmark/static/schema_speech.yaml
@@ -320,6 +320,27 @@ run_groups:
       when: "2019"
       language: English
 
+  - name: voxceleb2
+    display_name: VoxCeleb2
+    description: >
+      VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
+      interview videos uploaded to YouTube. It contains over a million utterances from over 6,000
+      speakers of 145 different nationalities, with gender, race, and identity information, covering
+      a wide range of accents, ages, ethnicities and languages.
+      ([Chung et al., 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: audio identification
+      what: audio clips in the wild
+      who: real speakers
+      when: "2018"
+      language: English, German, French
+
   - name: common_voice_15
     display_name: Common Voice 15
     description: >
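Note: putting the scenario and adapter together, each VoxCeleb2 test instance should render roughly like the sketch below (the exact template comes from the multiple-choice joint adapter with input_noun=None and output_noun="Answer"; the two audio clips are attached as media objects, shown here as placeholders):

    [audio clip 1] [audio clip 2]
    Listen to the audio and take your best guess to determine if the two speakers are the same person.
    A. Yes
    B. No
    Answer: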
diff --git a/src/helm/common/audio_utils.py b/src/helm/common/audio_utils.py
index 1690623df0..a355d8cb6c 100644
--- a/src/helm/common/audio_utils.py
+++ b/src/helm/common/audio_utils.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import soundfile as sf
+import subprocess
 
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -42,3 +43,23 @@ def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarray:
     # librosa accepts a local file path or a file-like object
     audio_array, _ = librosa.load(audio_file, sr=sample_rate)
     return audio_array
+
+
+def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
+    """Use ffmpeg to convert an audio file to the format implied by output_path's extension."""
+    if os.path.exists(output_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
+
+
+def use_ffmpeg_to_extract_audio_from_video(input_video_path: str, output_audio_path: str) -> None:
+    """Use ffmpeg to extract the audio track from a video file."""
+    if os.path.exists(output_audio_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
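Note: a minimal usage sketch of the new helpers (the file paths are placeholders; both calls assume ffmpeg is on the PATH and are no-ops when the output file already exists):

    from helm.common.audio_utils import (
        use_ffmpeg_to_convert_audio_file,
        use_ffmpeg_to_extract_audio_from_video,
    )

    # Convert a VoxCeleb2 .m4a clip to .wav; ffmpeg infers the target format
    # from the output extension.
    use_ffmpeg_to_convert_audio_file("clip.m4a", "clip.wav")

    # Extract the audio track of an .mp4 video as .mp3, as done in
    # CasualConversations2Scenario.
    use_ffmpeg_to_extract_audio_from_video("interview.mp4", "interview.mp3")

    # Both helpers raise ValueError if ffmpeg is missing or the conversion fails.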