From 6a65e8d554b8a652f7fd5521b9b5b3c4ae80a9a5 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Thu, 19 Dec 2024 17:29:31 +0100 Subject: [PATCH 1/3] feat: vqa rad scenario --- src/helm/benchmark/run_specs/vlm_run_specs.py | 162 ++++++++++++++---- .../vision_language/vqa_rad_scenario.py | 93 ++++++++++ 2 files changed, 221 insertions(+), 34 deletions(-) create mode 100644 src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index 5919166e0c..e0b0bcc06c 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -7,7 +7,9 @@ ADAPT_GENERATION_MULTIMODAL, ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL, ) -from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import DIFFICULTY_ALL +from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import ( + DIFFICULTY_ALL, +) from helm.benchmark.metrics.common_metric_specs import ( get_exact_match_metric_specs, get_generative_harms_metric_specs, @@ -51,7 +53,9 @@ def _get_generation_adapter_spec( ) -def _get_short_answer_generation_adapter_spec(instructions: Optional[str] = None) -> AdapterSpec: +def _get_short_answer_generation_adapter_spec( + instructions: Optional[str] = None, +) -> AdapterSpec: return _get_generation_adapter_spec( instructions=( "Just give a short answer without answering in a complete sentence." @@ -107,7 +111,15 @@ def _get_multiple_choice_joint_adapter_spec( def _get_open_ended_generation_metric_specs() -> List[MetricSpec]: return get_basic_metric_specs( - ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"] + [ + "exact_match", + "quasi_exact_match", + "f1_score", + "rouge_l", + "bleu_1", + "bleu_4", + "cider", + ] ) @@ -118,7 +130,9 @@ def _get_image2struct_metric_specs( include_edit_similarity: bool = True, size_handling_method: str = "resize", ) -> List[MetricSpec]: - from helm.benchmark.metrics.vision_language.image_metrics import AnnotatedImageMetrics + from helm.benchmark.metrics.vision_language.image_metrics import ( + AnnotatedImageMetrics, + ) if metric_names is None: metric_names = [ @@ -147,7 +161,9 @@ def _get_image2struct_metric_specs( return metric_specs + get_basic_metric_specs([]) -def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]: +def _get_prometheus_vision_critique_metric_specs( + num_respondents: int, max_tokens: int +) -> List[MetricSpec]: return [ MetricSpec( class_name="helm.benchmark.metrics.prometheus_vision_critique_metrics.PrometheusVisionCritiqueMetric", @@ -159,7 +175,9 @@ def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_token ] -def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[MetricSpec]: +def _get_gpt4v_critique_originality_metric_specs( + num_respondents: int, +) -> List[MetricSpec]: return [ MetricSpec( class_name="helm.benchmark.metrics.gpt4v_originality_critique_metrics.GPT4VCritiqueMetric", @@ -170,7 +188,9 @@ def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[M ] -def _get_vibe_eval_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]: +def _get_vibe_eval_critique_metric_specs( + num_respondents: int, max_tokens: int +) -> List[MetricSpec]: return [ MetricSpec( class_name="helm.benchmark.metrics.reka_vibe_critique_metrics.RekaVibeCritiqueMetric", @@ -233,7 +253,9 @@ def 
get_chart2csv_spec() -> RunSpec: @run_spec_function("crossmodal_3600") -def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) -> RunSpec: +def get_crossmodal_3600_spec( + location: str, language: str, num_respondents: int +) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario", args={"location": location, "language": language}, @@ -264,7 +286,8 @@ def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) @run_spec_function("flickr30k") def get_flickr30k_spec(num_respondents: int) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", + args={}, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Generate a caption for the following image in plain words. The caption should " @@ -293,12 +316,15 @@ def get_flickr30k_spec(num_respondents: int) -> RunSpec: @run_spec_function("gqa") def get_gqa_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", + args={}, ) adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( instructions="Answer the question using a single word." ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "gqa" return RunSpec( @@ -313,9 +339,12 @@ def get_gqa_spec() -> RunSpec: @run_spec_function("hateful_memes") def get_hateful_memes_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", + args={}, + ) + adapter_spec = _get_multiple_choice_joint_adapter_spec( + input_noun=None, output_noun="Answer", max_train_instances=0 ) - adapter_spec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0) metric_specs: List[MetricSpec] = get_exact_match_metric_specs() run_spec_name: str = "hateful_memes" @@ -368,7 +397,9 @@ def get_mscoco_captioning_spec(long: bool = False) -> RunSpec: "not need to be a complete sentence.", max_tokens=20, ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "mscoco_captioning" if long: @@ -409,7 +440,8 @@ def get_mscoco_categorization_spec() -> RunSpec: @run_spec_function("originality_vlm") def get_originality_vlm_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", + args={}, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500) metric_specs: List[MetricSpec] = get_generative_harms_metric_specs( @@ -429,14 +461,17 @@ def get_originality_vlm_spec() -> RunSpec: 
@run_spec_function("viz_wiz") def get_viz_wiz_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", + args={}, ) adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( # Following https://arxiv.org/abs/2310.03744 instructions="When the provided information is insufficient, respond with 'Unanswerable'. " "Answer the question using a single word or phrase." ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "viz_wiz" return RunSpec( @@ -451,14 +486,17 @@ def get_viz_wiz_spec() -> RunSpec: @run_spec_function("vqa") def get_vqa_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", + args={}, ) # Following https://arxiv.org/abs/2310.03744 adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( instructions='Answer the question using a single word or phrase. When the question asks "How many...", ' "respond with just a number (e.g., 3) and not the word corresponding to the number." ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "vqa" return RunSpec( @@ -472,11 +510,18 @@ def get_vqa_spec() -> RunSpec: @run_spec_function("image2latex") def get_image2latex_spec( - subset: str, recompile_prompt: bool = False, difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None + subset: str, + recompile_prompt: bool = False, + difficulty: str = DIFFICULTY_ALL, + args: Optional[Dict] = None, ) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.image2struct.latex_scenario.LatexScenario", - args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty}, + args={ + "subset": subset, + "recompile_prompt": recompile_prompt, + "difficulty": difficulty, + }, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Just give a short answer without answering in a complete sentence.", @@ -485,7 +530,9 @@ def get_image2latex_spec( metric_specs: List[MetricSpec] = _get_image2struct_metric_specs( generation_type="latex", args=args, - include_edit_similarity=("wild" not in subset), # No ground truth for "wild" subset + include_edit_similarity=( + "wild" not in subset + ), # No ground truth for "wild" subset size_handling_method="padding", ) annotator_specs: List[AnnotatorSpec] = [ @@ -519,7 +566,11 @@ def get_image2webpage_spec( ) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.image2struct.webpage_scenario.WebpageScenario", - args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty}, + args={ + "subset": subset, + "recompile_prompt": recompile_prompt, + "difficulty": difficulty, + }, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Just give a short answer without answering in a complete sentence.", @@ -528,7 +579,9 @@ def get_image2webpage_spec( metric_specs: 
List[MetricSpec] = _get_image2struct_metric_specs( generation_type="webpage", args=args, - include_edit_similarity=("wild" not in subset), # No ground truth for "wild" subset + include_edit_similarity=( + "wild" not in subset + ), # No ground truth for "wild" subset size_handling_method="resize", ) annotator_specs: List[AnnotatorSpec] = [ @@ -542,7 +595,11 @@ def get_image2webpage_spec( if "wild" in subset: groups = ["image2webpage_wild"] else: - groups = ["image2webpage", f"image2webpage_{difficulty}", f"image2webpage_{subset}"] + groups = [ + "image2webpage", + f"image2webpage_{difficulty}", + f"image2webpage_{subset}", + ] return RunSpec( name=run_spec_name, scenario_spec=scenario_spec, @@ -584,7 +641,9 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec: @run_spec_function("image2musicsheet") -def get_image2musicsheet_spec(difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None) -> RunSpec: +def get_image2musicsheet_spec( + difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None +) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.image2struct.musicsheet_scenario.MusicSheetScenario", # There os only one subset for music sheets @@ -675,7 +734,8 @@ def get_unicorn_spec(subject: str) -> RunSpec: @run_spec_function("bingo") def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject} + class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", + args={"subject": subject}, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Answer the question with a complete and clear explanation in sentences without listing it out.", @@ -874,8 +934,12 @@ def get_real_world_qa_spec() -> RunSpec: ) # Leave the instructions blank because the questions of the dataset already contain the instructions - adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(instructions="") - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( + instructions="" + ) + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "real_world_qa" return RunSpec( @@ -897,7 +961,9 @@ def get_blink_spec(category: str) -> RunSpec: instructions="Answer the multiple choice question by just giving the letter of the correct answer.", max_tokens=1, ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "blink" return RunSpec( @@ -919,7 +985,9 @@ def get_mm_star_spec(category: str) -> RunSpec: instructions="Answer the multiple choice question by just giving the letter of the correct answer.", max_tokens=1, ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "mm_star" return RunSpec( @@ -932,7 +1000,9 @@ def get_mm_star_spec(category: str) -> RunSpec: @run_spec_function("exams_v") -def get_exams_v_spec(language: str, subject_grouped: str, type: str = "image_text") -> 
RunSpec: +def get_exams_v_spec( + language: str, subject_grouped: str, type: str = "image_text" +) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.exams_v_scenario.ExamsVScenario", args={"language": language, "subject_grouped": subject_grouped, "type": type}, @@ -964,7 +1034,9 @@ def get_mementos_spec(subject: str, num_respondents: int) -> RunSpec: ) adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec() metric_specs: List[MetricSpec] = ( - _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200) + _get_prometheus_vision_critique_metric_specs( + num_respondents=num_respondents, max_tokens=200 + ) + _get_open_ended_generation_metric_specs() ) @@ -986,7 +1058,9 @@ def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec: ) adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec() metric_specs: List[MetricSpec] = ( - _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200) + _get_prometheus_vision_critique_metric_specs( + num_respondents=num_respondents, max_tokens=200 + ) + _get_open_ended_generation_metric_specs() ) @@ -998,3 +1072,23 @@ def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec: metric_specs=metric_specs, groups=[run_spec_name], ) + + +@run_spec_function("vqa_rad") +def get_vqa_rad_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vision_language.vqa_rad_scenario.VQARadScenario", + ) + adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( + instructions="Answer the question using a single word or sentence." + ) + metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs() + + run_spec_name: str = "vqa_rad" + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) diff --git a/src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py b/src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py new file mode 100644 index 0000000000..94d4c1231b --- /dev/null +++ b/src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py @@ -0,0 +1,93 @@ +import os +from typing import List + +from datasets import DatasetDict, load_dataset +from PIL import Image + +from helm.benchmark.scenarios.scenario import ( + CORRECT_TAG, + TEST_SPLIT, + TRAIN_SPLIT, + Input, + Instance, + Output, + Reference, + Scenario, +) +from helm.common.general import ensure_directory_exists +from helm.common.media_object import MediaObject, MultimediaObject + + +class VQARadScenario(Scenario): + """ + VQARad scenario: Processes a visual question answering dataset with radiology images. + + Each record in the dataset has: + - image + - question + - answer + + The output is formatted as: + "Answer: " + """ + + HUGGING_FACE_DATASET_PATH: str = "flaviagiammarino/vqa-rad" + + name = "vqa_rad" + description = "Visual question answering with radiology images." 
+ tags = [ + "vision-language", + "visual question answering", + "reasoning", + "medical", + "radiology", + ] + + def get_instances(self, output_path: str) -> List[Instance]: + dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH) + + splits = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} + instances: List[Instance] = [] + # Iterate over the splits + for ( + helm_split_name, + dataset_split_name, + ) in splits.items(): + split_path: str = os.path.join(output_path, dataset_split_name) + ensure_directory_exists(split_path) + + split_data = dataset[dataset_split_name] + + for index, example in enumerate(split_data): + question = example["question"] + image = example["image"] + answer = example["answer"] + + # Convert PIL image to MediaObject + image_path = os.path.join(split_path, f"{index}.jpg") + image.save(image_path) + + content = [ + MediaObject(location=image_path, content_type="image/jpeg"), + MediaObject(text=question, content_type="text/plain"), + ] + + # Format the final answer + instances.append( + Instance( + input=Input(multimedia_content=MultimediaObject(content)), + references=[ + Reference( + Output(text=answer), + tags=[CORRECT_TAG], + ) + ], + split=helm_split_name, + extra_data={ + "id": index, + "image_path": image_path, + }, + ) + ) + + return instances From aabe0feb86b7ecb5c8fa4e9fce83d2c056e79669 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Tue, 24 Dec 2024 14:53:59 +0100 Subject: [PATCH 2/3] improve(vlm): add max tokens to short answer --- src/helm/benchmark/run_specs/vlm_run_specs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index e0b0bcc06c..0b3d58db8d 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -55,6 +55,7 @@ def _get_generation_adapter_spec( def _get_short_answer_generation_adapter_spec( instructions: Optional[str] = None, + max_tokens: Optional[int] = None, ) -> AdapterSpec: return _get_generation_adapter_spec( instructions=( @@ -62,7 +63,7 @@ def _get_short_answer_generation_adapter_spec( if instructions is None else instructions ), - max_tokens=20, + max_tokens=20 if max_tokens is None else max_tokens, ) From 1c7e6bf356c46b16760f7ec8b5c69fa43c4574e4 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Tue, 24 Dec 2024 14:54:52 +0100 Subject: [PATCH 3/3] docs: document how to run sqlite with cloud computes --- docs/tutorial.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/tutorial.md b/docs/tutorial.md index 77620560e7..164b612bf8 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -27,6 +27,10 @@ The meaning of the arguments are as follows: - The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory. - The output directory is `benchmark_output/` by default and can be set using `--output-path`. +> 📘 Good to know +> +> When running the command on cloud computes, sqlite may throw an `OperationalError: database is locked` because the environment directory is **network mounted**. To workaround the issue, set `--local-path` to a local path (typically `/tmp` and `/mnt`). + After running this command, navigate to the `benchmark_output/runs/my-suite/` directory. This should contain a two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`. 
Note that the names of these sub-directories are based on the run entries we used earlier, but with `/` replaced with `_`. Each output sub-directory will contain several JSON files that were generated during the corresponding run:
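
As a quick sanity check once the three patches are applied, the new `vqa_rad` run entry can be exercised with the same `helm-run` workflow the tutorial describes. The sketch below is illustrative only: the model deployment name is a placeholder (substitute any vision-language model configured in `prod_env/credentials.conf`), and `--max-eval-instances` is kept small for a smoke test.

```bash
# Minimal smoke test for the new VQA-RAD scenario.
# NOTE: the model name is a placeholder, not something this patch adds;
# use any VLM deployment available in your HELM installation.
helm-run \
  --run-entries vqa_rad:model=openai/gpt-4o-2024-05-13 \
  --suite my-suite \
  --max-eval-instances 10
```

On a cloud machine whose environment directory is network mounted, adding `--local-path /tmp/helm_env` (any local path works; the path here is just an example) sidesteps the SQLite `database is locked` error documented in the third patch.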