
Commit

Merge branch 'main' of https://github.com/stanford-crfm/helm into speech_specs
teetone committed Nov 16, 2024
2 parents 749c49b + 145cbe1 commit eda28a3
Showing 8 changed files with 269 additions and 26 deletions.
2 changes: 1 addition & 1 deletion helm-frontend/project_metadata.json
@@ -45,7 +45,7 @@
"title": "AIR-Bench",
"description": "Safety benchmark based on emerging government regulations and company policies",
"id": "air-bench",
"releases": ["v1.2.0", "v1.1.0", "v1.0.0"]
"releases": ["v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
},
{
"title": "Safety",
3 changes: 3 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
@@ -10,6 +10,9 @@ entries: [
# Fairness
####################################################################################################################

{description: "casual_conversations2:subject=age,model=audiolm", priority: 1}
{description: "casual_conversations2:subject=gender,model=audiolm", priority: 1}

####################################################################################################################
# Robustness
####################################################################################################################
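Each `description` above is resolved by HELM's run-entry machinery: the scenario name selects the
matching @run_spec_function and each key=value argument is passed through (model=audiolm selects
the models to evaluate). A minimal sketch of that mapping, assuming the new casual_conversations2
run spec function below is registered:

    from helm.benchmark.run_specs.audio_run_specs import get_casual_conversations2_run_spec

    run_spec = get_casual_conversations2_run_spec(subject="age")
    print(run_spec.name)    # casual_conversations2:subject=age
    print(run_spec.groups)  # ['casual_conversations2']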
100 changes: 79 additions & 21 deletions src/helm/benchmark/run_specs/audio_run_specs.py
@@ -4,7 +4,10 @@
from helm.benchmark.adaptation.adapter_spec import (
AdapterSpec,
)
-from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION_MULTIMODAL
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION_MULTIMODAL,
+    ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+)
from helm.benchmark.metrics.common_metric_specs import (
get_classification_metric_specs,
get_exact_match_metric_specs,
@@ -42,6 +45,30 @@ def _get_generation_adapter_spec(
)


def _get_multiple_choice_joint_adapter_spec(
input_noun: Optional[str],
output_noun: str,
max_train_instances: int = 0,
num_outputs: int = 1,
) -> AdapterSpec:
return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
global_prefix="",
instructions="Answer the multiple choice question by just giving the letter of the correct answer.",
input_prefix=f"{input_noun}: " if input_noun is not None else "",
input_suffix="\n",
output_prefix=f"{output_noun}: ",
output_suffix="\n",
instance_prefix="\n",
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=1,
stop_sequences=["\n"],
temperature=0.0,
random=None,
)
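# Editorial sketch, not part of the commit: with the defaults above, the joint
# multiple-choice adapter is expected to render a zero-shot prompt roughly like
#
#     Answer the multiple choice question by just giving the letter of the correct answer.
#
#     <audio input>
#     A. <option 1>
#     B. <option 2>
#     Answer:
#
# max_tokens=1, temperature=0.0, and stop_sequences=["\n"] then constrain the model to a
# single, deterministic answer letter.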


########################################################################################################################
# MetricSpecs

@@ -78,12 +105,13 @@ def get_audio_mnist_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "audio_mnist"
return RunSpec(
name="audio_mnist",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["audio_mnist"],
groups=[run_spec_name],
)


@@ -97,12 +125,13 @@ def get_iemocap_audio_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "iemocap_audio"
return RunSpec(
name="iemocap_audio",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["iemocap_audio"],
groups=[run_spec_name],
)


@@ -116,12 +145,13 @@ def get_meld_audio_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "meld_audio"
return RunSpec(
name="meld_audio",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["meld_audio"],
groups=[run_spec_name],
)


@@ -156,12 +186,13 @@ def get_vocal_sound_run_spec() -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "vocal_sound"
return RunSpec(
name="vocal_sound",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["vocal_sound"],
groups=[run_spec_name],
)


@@ -181,12 +212,13 @@ def get_multilingual_librispeech_run_spec(language: str) -> RunSpec:
metric_specs = _get_chinese_audio_recognition_metric_specs()
else:
metric_specs = _get_audio_recognition_metric_specs()
run_spec_name: str = "multilingual_librispeech"
return RunSpec(
name="multilingual_librispeech",
name=f"{run_spec_name}:language={language}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["multilingual_librispeech"],
groups=[run_spec_name],
)


@@ -203,12 +235,13 @@ def get_fleurs_run_spec(language: str) -> RunSpec:
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "fleurs"
return RunSpec(
name="fleurs",
name=f"{run_spec_name}:language={language}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["fleurs"],
groups=[run_spec_name],
)


@@ -223,12 +256,13 @@ def get_audiocaps_run_spec() -> RunSpec:
max_tokens=50,
)
metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
run_spec_name: str = "audiocaps"
return RunSpec(
name="audiocaps",
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["audiocaps"],
groups=[run_spec_name],
)


@@ -248,12 +282,13 @@ def get_common_voice_15_run_spec(language: str) -> RunSpec:
metric_specs = _get_chinese_audio_recognition_metric_specs()
else:
metric_specs = _get_audio_recognition_metric_specs()
run_spec_name: str = "common_voice_15"
return RunSpec(
name="common_voice_15",
name=f"{run_spec_name}:language={language}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["common_voice_15"],
groups=[run_spec_name],
)


@@ -269,12 +304,13 @@ def get_speech_robust_bench_run_spec(subject: str) -> RunSpec:
max_tokens=100,
)
metric_specs = _get_audio_recognition_metric_specs()
run_spec_name: str = "speech_robust_bench"
return RunSpec(
name="speech_robust_bench",
name=f"{run_spec_name}:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["speech_robust_bench"],
groups=[run_spec_name],
)


@@ -289,10 +325,32 @@ def get_audio_pairs_run_spec(subject: str) -> RunSpec:
max_tokens=5,
)
metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_classification_metric_specs()
run_spec_name: str = "audio_pairs"
return RunSpec(
name=f"{run_spec_name}:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=[run_spec_name],
)


@run_spec_function("casual_conversations2")
def get_casual_conversations2_run_spec(subject: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.casual_conversations2_scenario."
"CasualConversations2Scenario",
args={"subject": subject},
)
adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
input_noun=None, output_noun="Answer", max_train_instances=0
)
metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
run_spec_name: str = "casual_conversations2"
return RunSpec(
name="audio_pairs",
name=f"{run_spec_name}:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["audio_pairs"],
groups=[run_spec_name],
)
156 changes: 156 additions & 0 deletions src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
@@ -0,0 +1,156 @@
from typing import List, Optional
import os

from tqdm import tqdm
import json

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_file_downloaded, ensure_directory_exists


class CasualConversations2Scenario(Scenario):
"""
    Casual Conversations v2 (Porgali et al., 2023) comprises 5,567 participants across 26,467 videos.
    The videos feature paid individuals who agreed to participate in the project and who explicitly
    self-reported labels for age, gender, language/dialect, geo-location, disability, physical
    adornments, and physical attributes. The videos were recorded in Brazil, India, Indonesia,
    Mexico, the Philippines, the United States, and Vietnam with a diverse set of adults in various
    categories.

    The dataset contains the audio plus each speaker's age and gender labels in the following
    languages: English, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Tamil, Telugu,
    and Vietnamese.

    Paper: https://arxiv.org/abs/2303.04838
    Dataset: https://ai.meta.com/datasets/casual-conversations-v2-dataset/

    Requires downloading Casual Conversations V2 from
    https://ai.meta.com/datasets/casual-conversations-v2-downloads
Citation:
@inproceedings{porgali2023casual,
title={The casual conversations v2 dataset},
author={Porgali, Bilal and Albiero, V{\'\i}tor and Ryda, Jordan and Ferrer, Cristian Canton and Hazirbas, Caner},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={10--17},
year={2023}
}
"""

SUBJECTS = ["age", "gender"]
SCRIPT_DOWNLOADING_URL = (
"https://huggingface.co/datasets/UCSC-VLAA/Causal_Conversation_V2_script/"
"resolve/main/CasualConversationsV2_v2.json"
)
AGE_INSTRUCTION = "Listen to the audio and take your best guess to estimate the speaker's age."
GENDER_INSTRUCTION = "Listen to the audio and take your best guess to determine the speaker's gender."
name = "casual_conversations2"
    description = (
        "A large-scale multilingual audio dataset labeled with speaker age and gender "
        "([Porgali et al., 2023](https://arxiv.org/abs/2303.04838))."
    )
tags = ["audio", "classification", "multilinguality"]
gender_options: List[str] = ["male", "female", "transgender male", "transgender female", "non-binary", "other"]
age_options: List[str] = ["18-30", "31-50", "51+", "other"]

def __init__(self, subject: str) -> None:
super().__init__()

if subject not in self.SUBJECTS:
raise ValueError(f"Invalid subject. Valid subjects are: {CasualConversations2Scenario.SUBJECTS}")

self._subject: str = subject
self._convert_answer_to_label_func = (
self._convert_age_to_label if subject == "age" else self._convert_gender_to_label
)
self.options = self.age_options if subject == "age" else self.gender_options
self.instruction = self.AGE_INSTRUCTION if subject == "age" else self.GENDER_INSTRUCTION

    def _extract_audio_from_video(self, input_video_path: str, output_audio_path: str) -> None:
        # os.system does not raise when the command fails, so check the exit status explicitly
        exit_code: int = os.system(f"ffmpeg -i {input_video_path} -q:a 0 -map a {output_audio_path}")
        if exit_code != 0:
            raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")

def _convert_age_to_label(self, age: str) -> str:
if age != "prefer not to say":
age_int = int(age)
if 18 <= age_int <= 30:
return "A"
elif 31 <= age_int <= 50:
return "B"
elif 51 <= age_int:
return "C"
else:
raise ValueError(f"Invalid age: {age}")
else:
return "D"

def _convert_gender_to_label(self, gender: Optional[str]) -> str:
if gender is not None and gender != "prefer not to say":
if gender == "cis man":
return "A"
elif gender == "cis woman":
return "B"
elif gender == "transgender man":
return "C"
elif gender == "transgender woman":
return "D"
elif gender == "non-binary":
return "E"
else:
raise ValueError(f"Invalid gender: {gender}")
else:
return "F"

def get_instances(self, output_path: str) -> List[Instance]:
data_dir: str = os.path.join(output_path, "videos_files")
assert os.path.exists(data_dir), (
f"Download the video files from Meta's Casual Conversations v2 dataset from "
f"(https://ai.meta.com/datasets/casual-conversations-v2-downloads) and unzip and place at {data_dir}."
)
script_file_path: str = os.path.join(output_path, "CasualConversationsV2.json")
audio_file_folder: str = os.path.join(output_path, "audio_files")
ensure_directory_exists(audio_file_folder)
ensure_file_downloaded(self.SCRIPT_DOWNLOADING_URL, script_file_path)
        with open(script_file_path) as script_file:
            audio_scripts = json.load(script_file)

instances: List[Instance] = []
split: str = TEST_SPLIT

for file_name in tqdm(os.listdir(data_dir)):
if file_name.endswith(".mp4"):
local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
local_video_path: str = os.path.join(data_dir, file_name)
if not os.path.exists(local_audio_path):
self._extract_audio_from_video(local_video_path, local_audio_path)
assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"

subject_answer = audio_scripts[file_name][self._subject]
answer = self._convert_answer_to_label_func(subject_answer)
# The given correct answer is a letter, but we need an index
correct_answer_index: int = ord(answer) - ord("A")
# The options are originally appended to the question

references: List[Reference] = []
for i, option in enumerate(self.options):
reference: Reference
is_correct: bool = i == correct_answer_index
reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
references.append(reference)

content = [
MediaObject(content_type="audio/mpeg", location=local_audio_path),
MediaObject(content_type="text/plain", text=self.instruction),
]

input = Input(multimedia_content=MultimediaObject(content))
instances.append(Instance(input=input, references=references, split=split))

return instances
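A minimal usage sketch for the new scenario (editorial, not part of the commit; the output_path
below is illustrative, and the Casual Conversations v2 videos must already be unpacked into
<output_path>/videos_files as the assertion in get_instances requires):

    from helm.benchmark.scenarios.audio_language.casual_conversations2_scenario import (
        CasualConversations2Scenario,
    )

    scenario = CasualConversations2Scenario(subject="age")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/casual_conversations2")
    print(len(instances))  # one Instance per .mp4 video with successfully extracted audio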