From 0f7f3b4c35f1be670e8998f5168a612e915e523c Mon Sep 17 00:00:00 2001
From: Siwei Yang
Date: Thu, 21 Nov 2024 18:30:44 -0800
Subject: [PATCH 1/3] add VoxCeleb2Scenario for gender classification

---
 .../presentation/run_entries_speech.conf       |  1 +
 .../benchmark/run_specs/audio_run_specs.py     | 21 +++++
 .../audio_language/voxceleb2_scenario.py       | 87 +++++++++++++++++++
 src/helm/benchmark/static/schema_speech.yaml   | 22 +++++
 4 files changed, 131 insertions(+)
 create mode 100644 src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py

diff --git a/src/helm/benchmark/presentation/run_entries_speech.conf b/src/helm/benchmark/presentation/run_entries_speech.conf
index 68276b629a..e89a61208d 100644
--- a/src/helm/benchmark/presentation/run_entries_speech.conf
+++ b/src/helm/benchmark/presentation/run_entries_speech.conf
@@ -5,6 +5,7 @@ entries: [
   {description: "meld_audio:model=audiolm", priority: 1}
   {description: "vocal_sound:model=audiolm", priority: 1}
   {description: "audiocaps:model=audiolm", priority: 1}
+  {description: "voxceleb2:model=audiolm", priority: 1}
 
   ####################################################################################################################
   # Fairness
diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index ccccb1a08d..5b36c1569a 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -266,6 +266,27 @@ def get_audiocaps_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("voxceleb2")
+def get_voxceleb2_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Determine the gender of the following audio's speaker. Reply only with 'Male' or 'Female'. "
" + "Don't include any explanation", + max_tokens=50, + ) + metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + run_spec_name: str = "voxceleb2" + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) + + @run_spec_function("common_voice_15") def get_common_voice_15_run_spec(language: str) -> RunSpec: scenario_spec = ScenarioSpec( diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py new file mode 100644 index 0000000000..f85fb0ccb7 --- /dev/null +++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py @@ -0,0 +1,87 @@ +from typing import List +import os + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from tqdm import tqdm +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.general import ensure_file_downloaded +import pandas as pd +from glob import glob +from pydub import AudioSegment +from multiprocessing import Pool + + +def _m4a_to_wav(input_path, output_path): + audio = AudioSegment.from_file(input_path, format="m4a") + audio.export(output_path, format="wav") + + +def _process_single_sample(audio_path_gender_pair): + audio_path, gender = audio_path_gender_pair + audio_path_wav = audio_path[:-3] + "wav" + _m4a_to_wav(audio_path, audio_path_wav) + input = Input( + multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path_wav)]) + ) + references = [Reference(Output(text=gender), tags=[CORRECT_TAG])] + return Instance(input=input, references=references, split=TEST_SPLIT) + + +class VoxCeleb2Scenario(Scenario): + """VoxCeleb2 + + VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from + interview videos uploaded to YouTube. + + Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf + + Citation: + @inproceedings{Chung18b, + author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.", + title = "VoxCeleb2: Deep Speaker Recognition", + booktitle = "INTERSPEECH", + year = "2018", + } + """ + + DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip" + REFERENCE_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_meta.csv" + + name = "voxceleb2" + description = "A large-scale dataset of about 46K audio clips to human-written text pairs \ + ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))." 
From 4ab0c85d136482e47df0134f28e4df393ac46ace Mon Sep 17 00:00:00 2001
From: Siwei Yang
Date: Thu, 21 Nov 2024 19:55:29 -0800
Subject: [PATCH 2/3] change VoxCeleb2Scenario to an audio identification task

---
 .../benchmark/run_specs/audio_run_specs.py     |  4 +-
 .../audio_language/voxceleb2_scenario.py       | 63 ++++++++++---------
 src/helm/benchmark/static/schema_speech.yaml   |  2 +-
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index 5b36c1569a..66a424a0e7 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -272,8 +272,8 @@ def get_voxceleb2_run_spec() -> RunSpec:
         class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
     )
     adapter_spec = _get_generation_adapter_spec(
-        instructions="Determine the gender of the following audio's speaker. Reply only with 'Male' or 'Female'. "
-        "Don't include any explanation",
+        instructions="Determine whether the speakers in the following two audio clips are the same person. "
+        "Reply only with 'True' or 'False'. Don't include any explanation",
         max_tokens=50,
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
index f85fb0ccb7..6710b277ff 100644
--- a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
+++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
@@ -1,5 +1,6 @@
 from typing import List
 import os
+import os.path as osp
 
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -24,26 +25,22 @@ def _m4a_to_wav(input_path, output_path):
     audio.export(output_path, format="wav")
 
 
-def _process_single_sample(audio_path_gender_pair):
-    audio_path, gender = audio_path_gender_pair
+def _preprocess_single_sample(audio_path):
+    assert osp.exists(audio_path), f"Audio file does not exist at path: {audio_path}"
     audio_path_wav = audio_path[:-3] + "wav"
-    _m4a_to_wav(audio_path, audio_path_wav)
-    input = Input(
-        multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path_wav)])
-    )
-    references = [Reference(Output(text=gender), tags=[CORRECT_TAG])]
-    return Instance(input=input, references=references, split=TEST_SPLIT)
+    audio = AudioSegment.from_file(audio_path, format="m4a")
+    audio.export(audio_path_wav, format="wav")
 
 
 class VoxCeleb2Scenario(Scenario):
     """VoxCeleb2
 
-    VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from 
+    VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
     interview videos uploaded to YouTube.
 
     Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf
 
-    Citation: 
+    Citation:
     @inproceedings{Chung18b,
         author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
         title = "VoxCeleb2: Deep Speaker Recognition",
         booktitle = "INTERSPEECH",
         year = "2018",
     }
     """
 
@@ -53,35 +50,45 @@ class VoxCeleb2Scenario(Scenario):
     DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
-    REFERENCE_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_meta.csv"
+    REFERENCE_URL = (
+        "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
+    )
 
     name = "voxceleb2"
     description = "A large-scale dataset of about 46K audio clips to human-written text pairs \
         ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))."
-    tags: List[str] = ["audio", "classification"]
+    tags: List[str] = ["audio", "identification"]
 
     def get_instances(self, output_path: str) -> List[Instance]:
         instances: List[Instance] = []
-        data_root = os.path.join(output_path, "data/test/aac")
+        data_root = osp.join(output_path, "data/test/aac")
         ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=data_root, unpack=True)
-        df = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=" ,")
-        df = df[df["Set"] == "test"]
-        df = df[df["VoxCeleb2 ID"].apply(lambda x: x not in ["id04170", "id05348"])]
+        df = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
 
-        audio_path_gender_pairs = []
-
-        for _, row in tqdm(df.iterrows(), total=len(df)):
-            vox_celeb2_id = row["VoxCeleb2 ID"]
-            gender = "Male" if row["Gender"] == "m" else "Female"
-            audio_dir = os.path.join(data_root, vox_celeb2_id)
-            assert os.path.exists(audio_dir), f"Audio file does not exist at path: {audio_dir}"
+        df["first"] = df["first"].apply(lambda x: osp.join(data_root, x))
+        df["second"] = df["second"].apply(lambda x: osp.join(data_root, x))
 
-            audio_paths = glob(os.path.join(audio_dir, "**/*.m4a"), recursive=True)
-            audio_paths = sorted(audio_paths)
+        all_paths = set(df["first"].to_list() + df["second"].to_list())
+        with Pool(processes=4) as pool:
+            list(tqdm(pool.imap(_preprocess_single_sample, all_paths), total=len(all_paths)))
 
-            audio_path_gender_pairs += [(audio_path, gender) for audio_path in audio_paths]
+        instances = []
 
-        with Pool(processes=4) as pool:
-            instances = pool.map(_process_single_sample, audio_path_gender_pairs)
+        for _, row in tqdm(df.iterrows(), total=len(df)):
+            first = row["first"][:-3] + "wav"
+            second = row["second"][:-3] + "wav"
+            same = "True" if row["same"] else "False"
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/wav", location=first),
+                        MediaObject(content_type="audio/wav", location=second),
+                    ]
+                )
+            )
+
+            references = [Reference(Output(text=same), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
 
         return instances
diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml
index 226fe196b9..d1617ea524 100644
--- a/src/helm/benchmark/static/schema_speech.yaml
+++ b/src/helm/benchmark/static/schema_speech.yaml
@@ -336,7 +336,7 @@ run_groups:
       main_name: cider
       main_split: test
     taxonomy:
-      task: audio gender classification
+      task: audio identification
       what: audio clips in the wild
       who: real speakers
       when: "2018"
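[Note on PATCH 2/3] The preprocessing pool above uses `pool.imap` rather than `pool.map`: `imap` yields results lazily as workers finish, so the surrounding `tqdm` can show live progress, whereas `map` would block until the whole batch completed. A self-contained sketch of the pattern, with a stand-in worker for `_preprocess_single_sample`:

    import time
    from multiprocessing import Pool

    from tqdm import tqdm

    def slow_convert(path: str) -> str:
        time.sleep(0.01)  # stand-in for the pydub m4a -> wav conversion
        return path[:-3] + "wav"

    if __name__ == "__main__":
        paths = [f"clip_{i:03d}.m4a" for i in range(200)]
        with Pool(processes=4) as pool:
            # list(...) drains the lazy iterator so every file is converted before
            # the pool is torn down; tqdm advances once per finished task.
            converted = list(tqdm(pool.imap(slow_convert, paths), total=len(paths)))
        print(converted[0])  # clip_000.wav
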
From dcd10c583ef3b59e47070e6c757a742d14f4f038 Mon Sep 17 00:00:00 2001
From: ImKeTT
Date: Fri, 22 Nov 2024 15:13:18 -0800
Subject: [PATCH 3/3] fix VoxCeleb2 adaptation, schema metadata, and ffmpeg
 audio handling

---
 .../benchmark/run_specs/audio_run_specs.py     |  6 +-
 .../casual_conversations2_scenario.py          | 10 +-
 .../audio_language/voxceleb2_scenario.py       | 91 +++++++++++--------
 src/helm/benchmark/static/schema_speech.yaml   | 13 ++-
 src/helm/common/audio_utils.py                 | 20 ++++
 5 files changed, 81 insertions(+), 59 deletions(-)

diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index 66a424a0e7..97fcddd55f 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -271,10 +271,8 @@ def get_voxceleb2_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
     )
-    adapter_spec = _get_generation_adapter_spec(
-        instructions="Determine whether the speakers in the following two audio clips are the same person. "
-        "Reply only with 'True' or 'False'. Don't include any explanation",
-        max_tokens=50,
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
     run_spec_name: str = "voxceleb2"
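[Note on PATCH 3/3] Switching from `_get_generation_adapter_spec` to `_get_multiple_choice_joint_adapter_spec` changes how the task is posed: instead of free-form "True"/"False" generation, HELM renders the scenario's references as lettered options and asks for a single letter, and `max_train_instances=0` keeps the evaluation zero-shot. With the scenario changes below, each request is rendered roughly like this (illustrative sketch, not verbatim adapter output):

    <audio clip 1>
    <audio clip 2>
    Listen to the audio and take your best guess to determine if the two speakers are the same person.
    A. Yes
    B. No
    Answer:
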
diff --git a/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py b/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
index 93c9cd812c..783c9c9fa9 100644
--- a/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
+++ b/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
@@ -15,6 +15,7 @@
 )
 from helm.common.media_object import MediaObject, MultimediaObject
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.audio_utils import use_ffmpeg_to_extract_audio_from_video
 
 
 class CasualConversations2Scenario(Scenario):
@@ -72,12 +73,6 @@ def __init__(self, subject: str) -> None:
         self.options = self.age_options if subject == "age" else self.gender_options
         self.instruction = self.AGE_INSTRUCTION if subject == "age" else self.GENDER_INSTRUCTION
 
-    def _extract_audio_from_video(self, input_video_path: str, output_audio_path: str) -> None:
-        try:
-            os.system(f"ffmpeg -i {input_video_path} -q:a 0 -map a {output_audio_path}")
-        except Exception:
-            raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
-
     def _convert_age_to_label(self, age: str) -> str:
         if age != "prefer not to say":
             age_int = int(age)
@@ -128,8 +123,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
             if file_name.endswith(".mp4"):
                 local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
                 local_video_path: str = os.path.join(data_dir, file_name)
-                if not os.path.exists(local_audio_path):
-                    self._extract_audio_from_video(local_video_path, local_audio_path)
+                use_ffmpeg_to_extract_audio_from_video(local_video_path, local_audio_path)
                 assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"
 
                 subject_answer = audio_scripts[file_name][self._subject]
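[Note on PATCH 3/3] The CasualConversations2 change also swaps `os.system` for `subprocess.run` via the new shared helper. Two practical wins, sketched below with hypothetical paths: an argument list bypasses the shell, so paths containing spaces or metacharacters arrive as single argv entries, and `check=True` turns a nonzero ffmpeg exit status into an exception instead of a silently ignored return code.

    import subprocess

    video, audio = "raw/my clip.mp4", "out/my clip.mp3"

    # Fragile: the shell splits the unquoted paths into several bogus arguments,
    # and the ffmpeg exit status is only available through the ignored return value.
    # os.system(f"ffmpeg -i {video} -q:a 0 -map a {audio}")

    # Robust: no shell involved; raises subprocess.CalledProcessError on failure
    # and FileNotFoundError if the ffmpeg binary is not installed.
    subprocess.run(["ffmpeg", "-i", video, "-q:a", "0", "-map", "a", audio], check=True)
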
format="m4a") - audio.export(audio_path_wav, format="wav") class VoxCeleb2Scenario(Scenario): """VoxCeleb2 - VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from - interview videos uploaded to YouTube. + VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from + interview videos uploaded to YouTube. This dataset contains over a million utterances from over + 6,000 speakers. Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf @@ -53,42 +39,67 @@ class VoxCeleb2Scenario(Scenario): REFERENCE_URL = ( "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv" ) + IDENTITY_INSTRUCTION = ( + "Listen to the audio and take your best guess to determine if the two speakers are the same person." + ) name = "voxceleb2" - description = "A large-scale dataset of about 46K audio clips to human-written text pairs \ - ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))." + description = ( + "A large-scale dataset of over a million utterances from over 6,000 speakers with their" + "gender, race, identity information" + "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))." + ) tags: List[str] = ["audio", "identification"] + options: List[str] = ["Yes", "No"] + + def _convert_answer_to_label(self, answer: bool) -> str: + if answer: + return "A" + else: + return "B" + + def _reformat_and_convert_audio_file( + self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str + ) -> str: + tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav") + ensure_directory_exists(os.path.dirname(tgt_audio_path)) + use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path) + return tgt_audio_path def get_instances(self, output_path: str) -> List[Instance]: instances: List[Instance] = [] - data_root = osp.join(output_path, "data/test/aac") - ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=data_root, unpack=True) - df = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",") - - df["first"] = df["first"].apply(lambda x: osp.join(data_root, x)) - df["second"] = df["second"].apply(lambda x: osp.join(data_root, x)) - - all_paths = set(df["first"].to_list() + df["second"].to_list()) - with Pool(processes=4) as pool: - list(tqdm(pool.imap(_preprocess_single_sample, all_paths), total=len(all_paths))) - + audio_data_path = os.path.join(output_path, "audio_files") + tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files") + ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True) + annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",") instances = [] + for _, row in tqdm(annotations.iterrows(), total=len(annotations)): + tgt_first_audio_path = self._reformat_and_convert_audio_file( + row["first"], tgt_audio_data_path, audio_data_path + ) + tgt_second_audio_path = self._reformat_and_convert_audio_file( + row["second"], tgt_audio_data_path, audio_data_path + ) - for _, row in tqdm(df.iterrows(), total=len(df)): - first = row["first"][:-3] + "wav" - second = row["second"][:-3] + "wav" - same = "True" if row["same"] else "False" + answer = self._convert_answer_to_label(row["same"]) + # The given correct answer is a letter, but we need an index + correct_answer_index: int = ord(answer) - ord("A") + references: List[Reference] = [] + for i, option in 
diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml
index d1617ea524..79c6f99163 100644
--- a/src/helm/benchmark/static/schema_speech.yaml
+++ b/src/helm/benchmark/static/schema_speech.yaml
@@ -323,24 +323,23 @@ run_groups:
   - name: voxceleb2
     display_name: VoxCeleb2
     description: >
-      AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
-      via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
-      musical instruments and genres, and common everyday environmental sounds.
-      ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf)).
       VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
-      interview videos uploaded to YouTube.([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
+      interview videos uploaded to YouTube. It contains over a million utterances from over 6,000
+      speakers of 145 different nationalities, covering a wide range of accents, ages, ethnicities
+      and languages, annotated with gender, race, and identity information.
+      ([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
     metric_groups:
     - accuracy
     - general_information
     environment:
-      main_name: cider
+      main_name: exact_match
       main_split: test
     taxonomy:
       task: audio identification
       what: audio clips in the wild
       who: real speakers
       when: "2018"
-      language: 145 languages
+      language: English, German, French
 
   - name: common_voice_15
     display_name: Common Voice 15
     description: >
diff --git a/src/helm/common/audio_utils.py b/src/helm/common/audio_utils.py
index 1690623df0..a355d8cb6c 100644
--- a/src/helm/common/audio_utils.py
+++ b/src/helm/common/audio_utils.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import soundfile as sf
+import subprocess
 
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -42,3 +43,22 @@ def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarray:
     # librosa accepts a local file path or a file-like object
     audio_array, _ = librosa.load(audio_file, sr=sample_rate)
     return audio_array
+
+
+def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
+    """Use ffmpeg to convert an audio file to a different format."""
+    if os.path.exists(output_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
+
+
+def use_ffmpeg_to_extract_audio_from_video(input_video_path: str, output_audio_path: str) -> None:
+    if os.path.exists(output_audio_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
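
[Note on PATCH 3/3] A usage sketch for the two helpers added to `audio_utils.py` (paths are hypothetical). Both return early when the output file already exists, so scenarios can call them unconditionally on every run; note that `use_ffmpeg_to_convert_audio_file` does not create the destination directory itself, which is why the VoxCeleb2 scenario calls `ensure_directory_exists` first.

    from helm.common.audio_utils import (
        use_ffmpeg_to_convert_audio_file,
        use_ffmpeg_to_extract_audio_from_video,
    )
    from helm.common.general import ensure_directory_exists

    # Hypothetical paths; both calls are no-ops if the target file already exists.
    ensure_directory_exists("tgt/id00017")
    use_ffmpeg_to_convert_audio_file("data/id00017/clip.m4a", "tgt/id00017/clip.wav")
    use_ffmpeg_to_extract_audio_from_video("videos/session.mp4", "videos/session.mp3")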