
Commit

Merge branch 'main' of https://github.com/stanford-crfm/helm into speech_specs
teetone committed Nov 16, 2024
2 parents 749c49b + 145cbe1 commit eda28a3
Showing 8 changed files with 269 additions and 26 deletions.
2 changes: 1 addition & 1 deletion helm-frontend/project_metadata.json
@@ -45,7 +45,7 @@
"title": "AIR-Bench",
"description": "Safety benchmark based on emerging government regulations and company policies",
"id": "air-bench",
"releases": ["v1.2.0", "v1.1.0", "v1.0.0"]
"releases": ["v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
},
{
"title": "Safety",
3 changes: 3 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
@@ -10,6 +10,9 @@ entries: [
# Fairness
####################################################################################################################

{description: "casual_conversations2:subject=age,model=audiolm", priority: 1}
{description: "casual_conversations2:subject=gender,model=audiolm", priority: 1}

####################################################################################################################
# Robustness
####################################################################################################################
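Each `description` above is resolved by HELM's run-entry machinery: the scenario name selects the
matching @run_spec_function and each key=value argument is passed through (model=audiolm selects
the models to evaluate). A minimal sketch of that mapping, assuming the new casual_conversations2
run spec function below is registered:

    from helm.benchmark.run_specs.audio_run_specs import get_casual_conversations2_run_spec

    run_spec = get_casual_conversations2_run_spec(subject="age")
    print(run_spec.name)    # casual_conversations2:subject=age
    print(run_spec.groups)  # ['casual_conversations2']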
100 changes: 79 additions & 21 deletions src/helm/benchmark/run_specs/audio_run_specs.py
@@ -4,7 +4,10 @@
from helm.benchmark.adaptation.adapter_spec import (
AdapterSpec,
)
-from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION_MULTIMODAL
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION_MULTIMODAL,
+    ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+)
from helm.benchmark.metrics.common_metric_specs import (
get_classification_metric_specs,
get_exact_match_metric_specs,
@@ -42,6 +45,30 @@ def _get_generation_adapter_spec(
)


def _get_multiple_choice_joint_adapter_spec(
input_noun: Optional[str],
output_noun: str,
max_train_instances: int = 0,
num_outputs: int = 1,
) -> AdapterSpec:
return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
global_prefix="",
instructions="Answer the multiple choice question by just giving the letter of the correct answer.",
input_prefix=f"{input_noun}: " if input_noun is not None else "",
input_suffix="\n",
output_prefix=f"{output_noun}: ",
output_suffix="\n",
instance_prefix="\n",
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=1,
stop_sequences=["\n"],
temperature=0.0,
random=None,
)
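# Editorial sketch, not part of the commit: with the defaults above, the joint
# multiple-choice adapter is expected to render a zero-shot prompt roughly like
#
#     Answer the multiple choice question by just giving the letter of the correct answer.
#
#     <audio input>
#     A. <option 1>
#     B. <option 2>
#     Answer:
#
# max_tokens=1, temperature=0.0, and stop_sequences=["\n"] then constrain the model to a
# single, deterministic answer letter.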


########################################################################################################################
# MetricSpecs

@@ -78,12 +105,13 @@ def get_audio_mnist_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "audio_mnist"
return RunSpec(
name="audio_mnist",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["audio_mnist"],
groups=[run_spec_name],
)


@@ -97,12 +125,13 @@ def get_iemocap_audio_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "iemocap_audio"
return RunSpec(
name="iemocap_audio",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["iemocap_audio"],
groups=[run_spec_name],
)


@@ -116,12 +145,13 @@ def get_meld_audio_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "meld_audio"
return RunSpec(
name="meld_audio",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["meld_audio"],
groups=[run_spec_name],
)


@@ -156,12 +186,13 @@ def get_vocal_sound_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "vocal_sound"
return RunSpec(
name="vocal_sound",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["vocal_sound"],
groups=[run_spec_name],
)


@@ -181,12 +212,13 @@ def get_multilingual_librispeech_run_spec(language: str) -> RunSpec:
metric_specs = _get_chinese_audio_recognition_metric_specs()
else:
metric_specs = _get_audio_recognition_metric_specs()
run_spec_name: str = "multilingual_librispeech"
return RunSpec(
name="multilingual_librispeech",
name=f"{run_spec_name}:language={language}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["multilingual_librispeech"],
groups=[run_spec_name],
)


@@ -203,12 +235,13 @@ def get_fleurs_run_spec(language: str) -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "fleurs"
return RunSpec(
name="fleurs",
name=f"{run_spec_name}:language={language}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["fleurs"],
groups=[run_spec_name],
)


@@ -223,12 +256,13 @@ def get_audiocaps_run_spec() -> RunSpec:
max_tokens=50,
)
metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
run_spec_name: str = "audiocaps"
return RunSpec(
name="audiocaps",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["audiocaps"],
groups=[run_spec_name],
)


@@ -248,12 +282,13 @@ def get_common_voice_15_run_spec(language: str) -> RunSpec:
metric_specs = _get_chinese_audio_recognition_metric_specs()
else:
metric_specs = _get_audio_recognition_metric_specs()
run_spec_name: str = "common_voice_15"
return RunSpec(
name="common_voice_15",
name=f"{run_spec_name}:language={language}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["common_voice_15"],
groups=[run_spec_name],
)


@@ -269,12 +304,13 @@ def get_speech_robust_bench_run_spec(subject: str) -> RunSpec:
max_tokens=100,
)
metric_specs = _get_audio_recognition_metric_specs()
run_spec_name: str = "speech_robust_bench"
return RunSpec(
name="speech_robust_bench",
name=f"{run_spec_name}:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["speech_robust_bench"],
groups=[run_spec_name],
)


@@ -289,10 +325,32 @@ def get_audio_pairs_run_spec(subject: str) -> RunSpec:
max_tokens=5,
)
metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "audio_pairs"
return RunSpec(
name=f"{run_spec_name}:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=[run_spec_name],
)


@run_spec_function("casual_conversations2")
def get_casual_conversations2_run_spec(subject: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.casual_conversations2_scenario."
"CasualConversations2Scenario",
args={"subject": subject},
)
adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
input_noun=None, output_noun="Answer", max_train_instances=0
)
metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
run_spec_name: str = "casual_conversations2"
return RunSpec(
name="audio_pairs",
name=f"{run_spec_name}:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["audio_pairs"],
groups=[run_spec_name],
)
156 changes: 156 additions & 0 deletions src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
@@ -0,0 +1,156 @@
from typing import List, Optional
import os

from tqdm import tqdm
import json

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_file_downloaded, ensure_directory_exists


class CasualConversations2Scenario(Scenario):
"""
    Casual Conversations v2 (Porgali et al., 2023) comprises 5,567 participants across 26,467 videos.
    The videos feature paid individuals who agreed to participate in the project and who explicitly
    self-reported labels for age, gender, language/dialect, geo-location, disability, physical
    adornments, and physical attributes. The videos were recorded in Brazil, India, Indonesia,
    Mexico, the Philippines, the United States, and Vietnam with a diverse set of adults in various
    categories.

    The dataset contains the audio plus each speaker's age and gender labels in the following
    languages: English, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Tamil, Telugu,
    and Vietnamese.

    Paper: https://arxiv.org/abs/2303.04838
    Dataset: https://ai.meta.com/datasets/casual-conversations-v2-dataset/

    Requires downloading Casual Conversations V2 from
    https://ai.meta.com/datasets/casual-conversations-v2-downloads
Citation:
@inproceedings{porgali2023casual,
title={The casual conversations v2 dataset},
author={Porgali, Bilal and Albiero, V{\'\i}tor and Ryda, Jordan and Ferrer, Cristian Canton and Hazirbas, Caner},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={10--17},
year={2023}
}
"""

SUBJECTS = ["age", "gender"]
SCRIPT_DOWNLOADING_URL = (
"https://huggingface.co/datasets/UCSC-VLAA/Causal_Conversation_V2_script/"
"resolve/main/CasualConversationsV2_v2.json"
)
AGE_INSTRUCTION = "Listen to the audio and take your best guess to estimate the speaker's age."
GENDER_INSTRUCTION = "Listen to the audio and take your best guess to determine the speaker's gender."
name = "casual_conversations2"
    description = (
        "A large-scale multilingual audio dataset labeled with speaker age and gender "
        "([Porgali et al., 2023](https://arxiv.org/abs/2303.04838))."
    )
tags = ["audio", "classification", "multilinguality"]
gender_options: List[str] = ["male", "female", "transgender male", "transgender female", "non-binary", "other"]
age_options: List[str] = ["18-30", "31-50", "51+", "other"]

def __init__(self, subject: str) -> None:
super().__init__()

if subject not in self.SUBJECTS:
raise ValueError(f"Invalid subject. Valid subjects are: {CasualConversations2Scenario.SUBJECTS}")

self._subject: str = subject
self._convert_answer_to_label_func = (
self._convert_age_to_label if subject == "age" else self._convert_gender_to_label
)
self.options = self.age_options if subject == "age" else self.gender_options
self.instruction = self.AGE_INSTRUCTION if subject == "age" else self.GENDER_INSTRUCTION

    def _extract_audio_from_video(self, input_video_path: str, output_audio_path: str) -> None:
        # os.system does not raise when the command fails, so check the exit status explicitly
        exit_code: int = os.system(f"ffmpeg -i {input_video_path} -q:a 0 -map a {output_audio_path}")
        if exit_code != 0:
            raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")

def _convert_age_to_label(self, age: str) -> str:
if age != "prefer not to say":
age_int = int(age)
if 18 <= age_int <= 30:
return "A"
elif 31 <= age_int <= 50:
return "B"
elif 51 <= age_int:
return "C"
else:
raise ValueError(f"Invalid age: {age}")
else:
return "D"

def _convert_gender_to_label(self, gender: Optional[str]) -> str:
if gender is not None and gender != "prefer not to say":
if gender == "cis man":
return "A"
elif gender == "cis woman":
return "B"
elif gender == "transgender man":
return "C"
elif gender == "transgender woman":
return "D"
elif gender == "non-binary":
return "E"
else:
raise ValueError(f"Invalid gender: {gender}")
else:
return "F"

def get_instances(self, output_path: str) -> List[Instance]:
data_dir: str = os.path.join(output_path, "videos_files")
assert os.path.exists(data_dir), (
f"Download the video files from Meta's Casual Conversations v2 dataset from "
f"(https://ai.meta.com/datasets/casual-conversations-v2-downloads) and unzip and place at {data_dir}."
)
script_file_path: str = os.path.join(output_path, "CasualConversationsV2.json")
audio_file_folder: str = os.path.join(output_path, "audio_files")
ensure_directory_exists(audio_file_folder)
ensure_file_downloaded(self.SCRIPT_DOWNLOADING_URL, script_file_path)
        with open(script_file_path) as script_file:
            audio_scripts = json.load(script_file)

instances: List[Instance] = []
split: str = TEST_SPLIT

for file_name in tqdm(os.listdir(data_dir)):
if file_name.endswith(".mp4"):
local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
local_video_path: str = os.path.join(data_dir, file_name)
if not os.path.exists(local_audio_path):
self._extract_audio_from_video(local_video_path, local_audio_path)
assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"

subject_answer = audio_scripts[file_name][self._subject]
answer = self._convert_answer_to_label_func(subject_answer)
# The given correct answer is a letter, but we need an index
correct_answer_index: int = ord(answer) - ord("A")
# The options are originally appended to the question

references: List[Reference] = []
for i, option in enumerate(self.options):
reference: Reference
is_correct: bool = i == correct_answer_index
reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
references.append(reference)

content = [
MediaObject(content_type="audio/mpeg", location=local_audio_path),
MediaObject(content_type="text/plain", text=self.instruction),
]

input = Input(multimedia_content=MultimediaObject(content))
instances.append(Instance(input=input, references=references, split=split))

return instances
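A minimal usage sketch for the new scenario (editorial, not part of the commit; the output_path
below is illustrative, and the Casual Conversations v2 videos must already be unpacked into
<output_path>/videos_files as the assertion in get_instances requires):

    from helm.benchmark.scenarios.audio_language.casual_conversations2_scenario import (
        CasualConversations2Scenario,
    )

    scenario = CasualConversations2Scenario(subject="age")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/casual_conversations2")
    print(len(instances))  # one Instance per .mp4 video with successfully extracted audio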