stanford-crfm · ImKeTT · Nov 23, 2024 · Nov 22, 2024 · Nov 22, 2024 · Nov 22, 2024
diff --git a/src/helm/benchmark/presentation/run_entries_speech.conf b/src/helm/benchmark/presentation/run_entries_speech.conf
@@ -5,6 +5,7 @@ entries: [
     {description: "meld_audio:model=audiolm", priority: 1}
     {description: "vocal_sound:model=audiolm", priority: 1}
     {description: "audiocaps:model=audiolm", priority: 1}
+    {description: "voxceleb2:model=audiolm", priority: 1}
 
     ####################################################################################################################
     # Fairness

diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -266,6 +266,27 @@ def get_audiocaps_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("voxceleb2")
+def get_voxceleb2_run_spec() -> RunSpec:
+    """Build the run spec for the VoxCeleb2 scenario.
+
+    The prompt asks the model whether two audio clips were spoken by the same
+    person and to answer only 'True' or 'False'. The run is registered under the
+    "voxceleb2" name and group and uses the exact-match metric specs.
+    """
+    name: str = "voxceleb2"
+    prompt: str = (
+        "Determine whether the speakers in the following two audio clips are the same person. "
+        "Reply only with 'True' or 'False'. Don't include any explanation"
+    )
+    return RunSpec(
+        name=name,
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
+        ),
+        adapter_spec=_get_generation_adapter_spec(instructions=prompt, max_tokens=50),
+        metric_specs=get_exact_match_metric_specs(),
+        groups=[name],
+    )
+
+
 @run_spec_function("common_voice_15")
 def get_common_voice_15_run_spec(language: str) -> RunSpec:
     scenario_spec = ScenarioSpec(

diff --git a/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py b/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
@@ -0,0 +1,94 @@
+from typing import List
+import os
+import os.path as osp
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+import pandas as pd
+from glob import glob
+from pydub import AudioSegment
+from multiprocessing import Pool
+
+
+def _m4a_to_wav(input_path, output_path):
+    """Decode the M4A file at ``input_path`` and write it to ``output_path`` as WAV."""
+    AudioSegment.from_file(input_path, format="m4a").export(output_path, format="wav")
+
+
+def _preprocess_single_sample(audio_path):
+    """Convert one M4A clip to a WAV file stored next to it.
+
+    The output path is ``audio_path`` with its last three characters replaced by
+    "wav"; ``VoxCeleb2Scenario.get_instances`` derives the WAV location the same
+    way, so the two must stay in sync. A WAV file that already exists is kept,
+    which lets repeated runs skip the decode step.
+
+    Args:
+        audio_path: Path to the source ``.m4a`` file.
+
+    Raises:
+        FileNotFoundError: If ``audio_path`` does not exist.
+    """
+    if not osp.exists(audio_path):
+        # Raise explicitly instead of asserting: asserts are stripped under ``python -O``.
+        raise FileNotFoundError(f"Audio file does not exist at path: {audio_path}")
+
+    audio_path_wav = audio_path[:-3] + "wav"
+    if osp.exists(audio_path_wav):
+        return
+
+    # Export under a temporary name and rename afterwards, so an interrupted run
+    # cannot leave a truncated WAV that a later run would mistake for a finished one.
+    tmp_path = audio_path_wav + ".tmp"
+    audio = AudioSegment.from_file(audio_path, format="m4a")
+    # ``export`` hands back the output file handle; close it before renaming.
+    audio.export(tmp_path, format="wav").close()
+    os.replace(tmp_path, audio_path_wav)
+
+
+class VoxCeleb2Scenario(Scenario):
+    """VoxCeleb2
+
+    VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
+    interview videos uploaded to YouTube.
+
+    Each instance of this scenario pairs two clips with a "True"/"False" reference saying
+    whether the reference CSV marks them as the same speaker.
+
+    Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf
+
+    Citation:
+    @inproceedings{Chung18b,
+        author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
+        title = "VoxCeleb2: Deep Speaker Recognition",
+        booktitle = "INTERSPEECH",
+        year = "2018",
+    }
+    """
+
+    # Zip archive with the test audio (M4A/AAC files).
+    DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
+    # CSV listing the clip pairs; read below through its "first", "second" and "same" columns.
+    REFERENCE_URL = (
+        "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
+    )
+
+    name = "voxceleb2"
+    description = (
+        "A speaker identification task on VoxCeleb2: given two audio clips of human speech, "
+        "decide whether they were spoken by the same person "
+        "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))."
+    )
+    tags: List[str] = ["audio", "identification"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Download the test audio, convert it to WAV and build the clip-pair instances.
+
+        Args:
+            output_path: Directory under which the audio archive is downloaded and unpacked.
+
+        Returns:
+            One test-split instance per row of the reference CSV. Each instance holds two
+            WAV media objects and a single correct reference, "True" or "False", chosen
+            from the truthiness of the row's ``same`` column.
+        """
+        data_root = osp.join(output_path, "data/test/aac")
+        ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=data_root, unpack=True)
+        df = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
+
+        # Turn the CSV's clip paths into paths under the unpacked archive directory.
+        df["first"] = df["first"].apply(lambda x: osp.join(data_root, x))
+        df["second"] = df["second"].apply(lambda x: osp.join(data_root, x))
+
+        # De-duplicate so a clip that appears in several pairs is converted only once.
+        all_paths = set(df["first"].to_list() + df["second"].to_list())
+        with Pool(processes=4) as pool:
+            # Draining the iterator is what actually drives the conversions (and the progress bar).
+            list(tqdm(pool.imap(_preprocess_single_sample, all_paths), total=len(all_paths)))
+
+        instances: List[Instance] = []
+        for _, row in tqdm(df.iterrows(), total=len(df)):
+            # Must mirror the output naming used by _preprocess_single_sample.
+            first = row["first"][:-3] + "wav"
+            second = row["second"][:-3] + "wav"
+            same = "True" if row["same"] else "False"
+
+            audio_input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/wav", location=first),
+                        MediaObject(content_type="audio/wav", location=second),
+                    ]
+                )
+            )
+            references = [Reference(Output(text=same), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=audio_input, references=references, split=TEST_SPLIT))
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml
@@ -320,6 +320,28 @@ run_groups:
       when: "2019"
       language: English
 
+  - name: voxceleb2
+    display_name: VoxCeleb2
+    description: >
+      VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
+      interview videos uploaded to YouTube
+      ([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf)).
+      In this scenario the model is given two audio clips and must decide whether the speakers are
+      the same person.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      # The voxceleb2 run spec uses the exact-match metric specs, so report exact match here
+      # (the previous value, cider, is a captioning metric this run does not request).
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: audio identification
+      what: audio clips in the wild
+      who: real speakers
+      when: "2018"
+      # NOTE(review): confirm this figure; it may refer to speaker nationalities rather than languages.
+      language: 145 languages
+
   - name: common_voice_15
     display_name: Common Voice 15
     description: >