diff --git a/src/helm/benchmark/presentation/run_entries_speech.conf b/src/helm/benchmark/presentation/run_entries_speech.conf
index 68276b629a..e89a61208d 100644
--- a/src/helm/benchmark/presentation/run_entries_speech.conf
+++ b/src/helm/benchmark/presentation/run_entries_speech.conf
@@ -5,6 +5,7 @@ entries: [
   {description: "meld_audio:model=audiolm", priority: 1}
   {description: "vocal_sound:model=audiolm", priority: 1}
   {description: "audiocaps:model=audiolm", priority: 1}
+  {description: "voxceleb2:model=audiolm", priority: 1}
 
   ####################################################################################################################
   # Fairness
diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
index ccccb1a08d..97fcddd55f 100644
--- a/src/helm/benchmark/run_specs/audio_run_specs.py
+++ b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -266,6 +266,25 @@ def get_audiocaps_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("voxceleb2")
+def get_voxceleb2_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "voxceleb2"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("common_voice_15")
 def get_common_voice_15_run_spec(language: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
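Note: with the run entry and run spec above in place, the new scenario can be exercised end to end. A hedged invocation sketch (flag names as in current helm-run usage; the suite name and instance cap are placeholders):

    helm-run --run-entries voxceleb2:model=audiolm --suite my-suite --max-eval-instances 10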
{local_audio_path}" subject_answer = audio_scripts[file_name][self._subject] diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py new file mode 100644 index 0000000000..1a1218bfd5 --- /dev/null +++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py @@ -0,0 +1,105 @@ +from typing import List +import os + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from tqdm import tqdm +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.general import ensure_file_downloaded, ensure_directory_exists +from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file +import pandas as pd + + +class VoxCeleb2Scenario(Scenario): + """VoxCeleb2 + + VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from + interview videos uploaded to YouTube. This dataset contains over a million utterances from over + 6,000 speakers. + + Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf + + Citation: + @inproceedings{Chung18b, + author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.", + title = "VoxCeleb2: Deep Speaker Recognition", + booktitle = "INTERSPEECH", + year = "2018", + } + """ + + DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip" + REFERENCE_URL = ( + "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv" + ) + IDENTITY_INSTRUCTION = ( + "Listen to the audio and take your best guess to determine if the two speakers are the same person." + ) + + name = "voxceleb2" + description = ( + "A large-scale dataset of over a million utterances from over 6,000 speakers with their" + "gender, race, identity information" + "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))." 
diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
new file mode 100644
index 0000000000..1a1218bfd5
--- /dev/null
+++ b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
@@ -0,0 +1,104 @@
+from typing import List
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file
+import pandas as pd
+
+
+class VoxCeleb2Scenario(Scenario):
+    """VoxCeleb2
+
+    VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
+    interview videos uploaded to YouTube. This dataset contains over a million utterances from over
+    6,000 speakers.
+
+    Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf
+
+    Citation:
+    @inproceedings{Chung18b,
+        author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
+        title = "VoxCeleb2: Deep Speaker Recognition",
+        booktitle = "INTERSPEECH",
+        year = "2018",
+    }
+    """
+
+    DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
+    REFERENCE_URL = (
+        "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
+    )
+    IDENTITY_INSTRUCTION = (
+        "Listen to the audio and take your best guess to determine if the two speakers are the same person."
+    )
+
+    name = "voxceleb2"
+    description = (
+        "A large-scale dataset of over a million utterances from over 6,000 speakers with their "
+        "gender, race, and identity information "
+        "([Chung et al., 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))."
+    )
+    tags: List[str] = ["audio", "identification"]
+    options: List[str] = ["Yes", "No"]
+
+    def _convert_answer_to_label(self, answer: bool) -> str:
+        if answer:
+            return "A"
+        else:
+            return "B"
+
+    def _reformat_and_convert_audio_file(
+        self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str
+    ) -> str:
+        tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav")
+        ensure_directory_exists(os.path.dirname(tgt_audio_path))
+        use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path)
+        return tgt_audio_path
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        audio_data_path = os.path.join(output_path, "audio_files")
+        tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files")
+        ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True)
+        annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
+        for _, row in tqdm(annotations.iterrows(), total=len(annotations)):
+            tgt_first_audio_path = self._reformat_and_convert_audio_file(
+                row["first"], tgt_audio_data_path, audio_data_path
+            )
+            tgt_second_audio_path = self._reformat_and_convert_audio_file(
+                row["second"], tgt_audio_data_path, audio_data_path
+            )
+
+            answer = self._convert_answer_to_label(row["same"])
+            # The given correct answer is a letter, but we need an index
+            correct_answer_index: int = ord(answer) - ord("A")
+            references: List[Reference] = []
+            for i, option in enumerate(self.options):
+                reference: Reference
+                is_correct: bool = i == correct_answer_index
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                references.append(reference)
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/wav", location=tgt_first_audio_path),
+                        MediaObject(content_type="audio/wav", location=tgt_second_audio_path),
+                        MediaObject(content_type="text/plain", text=self.IDENTITY_INSTRUCTION),
+                    ]
+                )
+            )
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml
index 93a71e1bd2..79c6f99163 100644
--- a/src/helm/benchmark/static/schema_speech.yaml
+++ b/src/helm/benchmark/static/schema_speech.yaml
@@ -320,6 +320,27 @@ run_groups:
       when: "2019"
       language: English
 
+  - name: voxceleb2
+    display_name: VoxCeleb2
+    description: >
+      VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
+      interview videos uploaded to YouTube. It contains over a million utterances from over 6,000
+      speakers of 145 different nationalities, with gender, race, and identity information, covering
+      a wide range of accents, ages, ethnicities and languages.
+      ([Chung et al., 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: audio identification
+      what: audio clips in the wild
+      who: real speakers
+      when: "2018"
+      language: English, German, French
+
   - name: common_voice_15
     display_name: Common Voice 15
     description: >
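Note: putting the scenario and adapter together, each VoxCeleb2 test instance should render roughly like the sketch below (the exact template comes from the multiple-choice joint adapter with input_noun=None and output_noun="Answer"; the two audio clips are attached as media objects, shown here as placeholders):

    [audio clip 1] [audio clip 2]
    Listen to the audio and take your best guess to determine if the two speakers are the same person.
    A. Yes
    B. No
    Answer: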
diff --git a/src/helm/common/audio_utils.py b/src/helm/common/audio_utils.py
index 1690623df0..a355d8cb6c 100644
--- a/src/helm/common/audio_utils.py
+++ b/src/helm/common/audio_utils.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import soundfile as sf
+import subprocess
 
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -42,3 +43,23 @@ def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarray:
     # librosa accepts a local file path or a file-like object
     audio_array, _ = librosa.load(audio_file, sr=sample_rate)
     return audio_array
+
+
+def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
+    """Use ffmpeg to convert an audio file to the format implied by output_path's extension."""
+    if os.path.exists(output_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
+
+
+def use_ffmpeg_to_extract_audio_from_video(input_video_path: str, output_audio_path: str) -> None:
+    """Use ffmpeg to extract the audio track from a video file."""
+    if os.path.exists(output_audio_path):
+        return
+    try:
+        subprocess.run(["ffmpeg", "-i", input_video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
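Note: a minimal usage sketch of the new helpers (the file paths are placeholders; both calls assume ffmpeg is on the PATH and are no-ops when the output file already exists):

    from helm.common.audio_utils import (
        use_ffmpeg_to_convert_audio_file,
        use_ffmpeg_to_extract_audio_from_video,
    )

    # Convert a VoxCeleb2 .m4a clip to .wav; ffmpeg infers the target format
    # from the output extension.
    use_ffmpeg_to_convert_audio_file("clip.m4a", "clip.wav")

    # Extract the audio track of an .mp4 video as .mp3, as done in
    # CasualConversations2Scenario.
    use_ffmpeg_to_extract_audio_from_video("interview.mp4", "interview.mp3")

    # Both helpers raise ValueError if ffmpeg is missing or the conversion fails.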