From 6a65e8d554b8a652f7fd5521b9b5b3c4ae80a9a5 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Thu, 19 Dec 2024 17:29:31 +0100 Subject: [PATCH 1/3] feat: vqa rad scenario --- src/helm/benchmark/run_specs/vlm_run_specs.py | 162 ++++++++++++++---- .../vision_language/vqa_rad_scenario.py | 93 ++++++++++ 2 files changed, 221 insertions(+), 34 deletions(-) create mode 100644 src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index 5919166e0c..e0b0bcc06c 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -7,7 +7,9 @@ ADAPT_GENERATION_MULTIMODAL, ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL, ) -from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import DIFFICULTY_ALL +from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import ( + DIFFICULTY_ALL, +) from helm.benchmark.metrics.common_metric_specs import ( get_exact_match_metric_specs, get_generative_harms_metric_specs, @@ -51,7 +53,9 @@ def _get_generation_adapter_spec( ) -def _get_short_answer_generation_adapter_spec(instructions: Optional[str] = None) -> AdapterSpec: +def _get_short_answer_generation_adapter_spec( + instructions: Optional[str] = None, +) -> AdapterSpec: return _get_generation_adapter_spec( instructions=( "Just give a short answer without answering in a complete sentence." @@ -107,7 +111,15 @@ def _get_multiple_choice_joint_adapter_spec( def _get_open_ended_generation_metric_specs() -> List[MetricSpec]: return get_basic_metric_specs( - ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"] + [ + "exact_match", + "quasi_exact_match", + "f1_score", + "rouge_l", + "bleu_1", + "bleu_4", + "cider", + ] ) @@ -118,7 +130,9 @@ def _get_image2struct_metric_specs( include_edit_similarity: bool = True, size_handling_method: str = "resize", ) -> List[MetricSpec]: - from helm.benchmark.metrics.vision_language.image_metrics import AnnotatedImageMetrics + from helm.benchmark.metrics.vision_language.image_metrics import ( + AnnotatedImageMetrics, + ) if metric_names is None: metric_names = [ @@ -147,7 +161,9 @@ def _get_image2struct_metric_specs( return metric_specs + get_basic_metric_specs([]) -def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]: +def _get_prometheus_vision_critique_metric_specs( + num_respondents: int, max_tokens: int +) -> List[MetricSpec]: return [ MetricSpec( class_name="helm.benchmark.metrics.prometheus_vision_critique_metrics.PrometheusVisionCritiqueMetric", @@ -159,7 +175,9 @@ def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_token ] -def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[MetricSpec]: +def _get_gpt4v_critique_originality_metric_specs( + num_respondents: int, +) -> List[MetricSpec]: return [ MetricSpec( class_name="helm.benchmark.metrics.gpt4v_originality_critique_metrics.GPT4VCritiqueMetric", @@ -170,7 +188,9 @@ def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[M ] -def _get_vibe_eval_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]: +def _get_vibe_eval_critique_metric_specs( + num_respondents: int, max_tokens: int +) -> List[MetricSpec]: return [ MetricSpec( class_name="helm.benchmark.metrics.reka_vibe_critique_metrics.RekaVibeCritiqueMetric", @@ -233,7 +253,9 @@ def 
get_chart2csv_spec() -> RunSpec: @run_spec_function("crossmodal_3600") -def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) -> RunSpec: +def get_crossmodal_3600_spec( + location: str, language: str, num_respondents: int +) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario", args={"location": location, "language": language}, @@ -264,7 +286,8 @@ def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) @run_spec_function("flickr30k") def get_flickr30k_spec(num_respondents: int) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", + args={}, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Generate a caption for the following image in plain words. The caption should " @@ -293,12 +316,15 @@ def get_flickr30k_spec(num_respondents: int) -> RunSpec: @run_spec_function("gqa") def get_gqa_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", + args={}, ) adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( instructions="Answer the question using a single word." ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "gqa" return RunSpec( @@ -313,9 +339,12 @@ def get_gqa_spec() -> RunSpec: @run_spec_function("hateful_memes") def get_hateful_memes_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", + args={}, + ) + adapter_spec = _get_multiple_choice_joint_adapter_spec( + input_noun=None, output_noun="Answer", max_train_instances=0 ) - adapter_spec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0) metric_specs: List[MetricSpec] = get_exact_match_metric_specs() run_spec_name: str = "hateful_memes" @@ -368,7 +397,9 @@ def get_mscoco_captioning_spec(long: bool = False) -> RunSpec: "not need to be a complete sentence.", max_tokens=20, ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "mscoco_captioning" if long: @@ -409,7 +440,8 @@ def get_mscoco_categorization_spec() -> RunSpec: @run_spec_function("originality_vlm") def get_originality_vlm_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", + args={}, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500) metric_specs: List[MetricSpec] = get_generative_harms_metric_specs( @@ -429,14 +461,17 @@ def get_originality_vlm_spec() -> RunSpec: 
@run_spec_function("viz_wiz") def get_viz_wiz_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", + args={}, ) adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( # Following https://arxiv.org/abs/2310.03744 instructions="When the provided information is insufficient, respond with 'Unanswerable'. " "Answer the question using a single word or phrase." ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "viz_wiz" return RunSpec( @@ -451,14 +486,17 @@ def get_viz_wiz_spec() -> RunSpec: @run_spec_function("vqa") def get_vqa_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={} + class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", + args={}, ) # Following https://arxiv.org/abs/2310.03744 adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( instructions='Answer the question using a single word or phrase. When the question asks "How many...", ' "respond with just a number (e.g., 3) and not the word corresponding to the number." ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "vqa" return RunSpec( @@ -472,11 +510,18 @@ def get_vqa_spec() -> RunSpec: @run_spec_function("image2latex") def get_image2latex_spec( - subset: str, recompile_prompt: bool = False, difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None + subset: str, + recompile_prompt: bool = False, + difficulty: str = DIFFICULTY_ALL, + args: Optional[Dict] = None, ) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.image2struct.latex_scenario.LatexScenario", - args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty}, + args={ + "subset": subset, + "recompile_prompt": recompile_prompt, + "difficulty": difficulty, + }, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Just give a short answer without answering in a complete sentence.", @@ -485,7 +530,9 @@ def get_image2latex_spec( metric_specs: List[MetricSpec] = _get_image2struct_metric_specs( generation_type="latex", args=args, - include_edit_similarity=("wild" not in subset), # No ground truth for "wild" subset + include_edit_similarity=( + "wild" not in subset + ), # No ground truth for "wild" subset size_handling_method="padding", ) annotator_specs: List[AnnotatorSpec] = [ @@ -519,7 +566,11 @@ def get_image2webpage_spec( ) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.image2struct.webpage_scenario.WebpageScenario", - args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty}, + args={ + "subset": subset, + "recompile_prompt": recompile_prompt, + "difficulty": difficulty, + }, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Just give a short answer without answering in a complete sentence.", @@ -528,7 +579,9 @@ def get_image2webpage_spec( metric_specs: 
List[MetricSpec] = _get_image2struct_metric_specs( generation_type="webpage", args=args, - include_edit_similarity=("wild" not in subset), # No ground truth for "wild" subset + include_edit_similarity=( + "wild" not in subset + ), # No ground truth for "wild" subset size_handling_method="resize", ) annotator_specs: List[AnnotatorSpec] = [ @@ -542,7 +595,11 @@ def get_image2webpage_spec( if "wild" in subset: groups = ["image2webpage_wild"] else: - groups = ["image2webpage", f"image2webpage_{difficulty}", f"image2webpage_{subset}"] + groups = [ + "image2webpage", + f"image2webpage_{difficulty}", + f"image2webpage_{subset}", + ] return RunSpec( name=run_spec_name, scenario_spec=scenario_spec, @@ -584,7 +641,9 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec: @run_spec_function("image2musicsheet") -def get_image2musicsheet_spec(difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None) -> RunSpec: +def get_image2musicsheet_spec( + difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None +) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.image2struct.musicsheet_scenario.MusicSheetScenario", # There os only one subset for music sheets @@ -675,7 +734,8 @@ def get_unicorn_spec(subject: str) -> RunSpec: @run_spec_function("bingo") def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject} + class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", + args={"subject": subject}, ) adapter_spec: AdapterSpec = _get_generation_adapter_spec( instructions="Answer the question with a complete and clear explanation in sentences without listing it out.", @@ -874,8 +934,12 @@ def get_real_world_qa_spec() -> RunSpec: ) # Leave the instructions blank because the questions of the dataset already contain the instructions - adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(instructions="") - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( + instructions="" + ) + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "real_world_qa" return RunSpec( @@ -897,7 +961,9 @@ def get_blink_spec(category: str) -> RunSpec: instructions="Answer the multiple choice question by just giving the letter of the correct answer.", max_tokens=1, ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "blink" return RunSpec( @@ -919,7 +985,9 @@ def get_mm_star_spec(category: str) -> RunSpec: instructions="Answer the multiple choice question by just giving the letter of the correct answer.", max_tokens=1, ) - metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + metric_specs: List[MetricSpec] = ( + get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + ) run_spec_name: str = "mm_star" return RunSpec( @@ -932,7 +1000,9 @@ def get_mm_star_spec(category: str) -> RunSpec: @run_spec_function("exams_v") -def get_exams_v_spec(language: str, subject_grouped: str, type: str = "image_text") -> 
RunSpec: +def get_exams_v_spec( + language: str, subject_grouped: str, type: str = "image_text" +) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.vision_language.exams_v_scenario.ExamsVScenario", args={"language": language, "subject_grouped": subject_grouped, "type": type}, @@ -964,7 +1034,9 @@ def get_mementos_spec(subject: str, num_respondents: int) -> RunSpec: ) adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec() metric_specs: List[MetricSpec] = ( - _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200) + _get_prometheus_vision_critique_metric_specs( + num_respondents=num_respondents, max_tokens=200 + ) + _get_open_ended_generation_metric_specs() ) @@ -986,7 +1058,9 @@ def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec: ) adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec() metric_specs: List[MetricSpec] = ( - _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200) + _get_prometheus_vision_critique_metric_specs( + num_respondents=num_respondents, max_tokens=200 + ) + _get_open_ended_generation_metric_specs() ) @@ -998,3 +1072,23 @@ def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec: metric_specs=metric_specs, groups=[run_spec_name], ) + + +@run_spec_function("vqa_rad") +def get_vqa_rad_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vision_language.vqa_rad_scenario.VQARadScenario", + ) + adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec( + instructions="Answer the question using a single word or sentence." + ) + metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs() + + run_spec_name: str = "vqa_rad" + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) diff --git a/src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py b/src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py new file mode 100644 index 0000000000..94d4c1231b --- /dev/null +++ b/src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py @@ -0,0 +1,93 @@ +import os +from typing import List + +from datasets import DatasetDict, load_dataset +from PIL import Image + +from helm.benchmark.scenarios.scenario import ( + CORRECT_TAG, + TEST_SPLIT, + TRAIN_SPLIT, + Input, + Instance, + Output, + Reference, + Scenario, +) +from helm.common.general import ensure_directory_exists +from helm.common.media_object import MediaObject, MultimediaObject + + +class VQARadScenario(Scenario): + """ + VQARad scenario: Processes a visual question answering dataset with radiology images. + + Each record in the dataset has: + - image + - question + - answer + + The output is formatted as: + "Answer: " + """ + + HUGGING_FACE_DATASET_PATH: str = "flaviagiammarino/vqa-rad" + + name = "vqa_rad" + description = "Visual question answering with radiology images." 
+ tags = [ + "vision-language", + "visual question answering", + "reasoning", + "medical", + "radiology", + ] + + def get_instances(self, output_path: str) -> List[Instance]: + dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH) + + splits = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} + instances: List[Instance] = [] + # Iterate over the splits + for ( + helm_split_name, + dataset_split_name, + ) in splits.items(): + split_path: str = os.path.join(output_path, dataset_split_name) + ensure_directory_exists(split_path) + + split_data = dataset[dataset_split_name] + + for index, example in enumerate(split_data): + question = example["question"] + image = example["image"] + answer = example["answer"] + + # Convert PIL image to MediaObject + image_path = os.path.join(split_path, f"{index}.jpg") + image.save(image_path) + + content = [ + MediaObject(location=image_path, content_type="image/jpeg"), + MediaObject(text=question, content_type="text/plain"), + ] + + # Format the final answer + instances.append( + Instance( + input=Input(multimedia_content=MultimediaObject(content)), + references=[ + Reference( + Output(text=answer), + tags=[CORRECT_TAG], + ) + ], + split=helm_split_name, + extra_data={ + "id": index, + "image_path": image_path, + }, + ) + ) + + return instances From aabe0feb86b7ecb5c8fa4e9fce83d2c056e79669 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Tue, 24 Dec 2024 14:53:59 +0100 Subject: [PATCH 2/3] improve(vlm): add max tokens to short answer --- src/helm/benchmark/run_specs/vlm_run_specs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index e0b0bcc06c..0b3d58db8d 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -55,6 +55,7 @@ def _get_generation_adapter_spec( def _get_short_answer_generation_adapter_spec( instructions: Optional[str] = None, + max_tokens: Optional[int] = None, ) -> AdapterSpec: return _get_generation_adapter_spec( instructions=( @@ -62,7 +63,7 @@ def _get_short_answer_generation_adapter_spec( if instructions is None else instructions ), - max_tokens=20, + max_tokens=20 if max_tokens is None else max_tokens, ) From 1c7e6bf356c46b16760f7ec8b5c69fa43c4574e4 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Tue, 24 Dec 2024 14:54:52 +0100 Subject: [PATCH 3/3] docs: document how to run sqlite with cloud computes --- docs/tutorial.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/tutorial.md b/docs/tutorial.md index 77620560e7..164b612bf8 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -27,6 +27,10 @@ The meaning of the arguments are as follows: - The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory. - The output directory is `benchmark_output/` by default and can be set using `--output-path`. +> 📘 Good to know +> +> When running the command on cloud computes, sqlite may throw an `OperationalError: database is locked` because the environment directory is **network mounted**. To workaround the issue, set `--local-path` to a local path (typically `/tmp` and `/mnt`). + After running this command, navigate to the `benchmark_output/runs/my-suite/` directory. This should contain a two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`. 
Note that the names of these sub-directories are based on the run entries we used earlier, but with `/` replaced with `_`. Each output sub-directory will contain several JSON files that were generated during the corresponding run:
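
As a quick sanity check once the three patches are applied, the new `vqa_rad` run entry can be exercised with the same `helm-run` workflow the tutorial describes. The sketch below is illustrative only: the model deployment name is a placeholder (substitute any vision-language model configured in `prod_env/credentials.conf`), and `--max-eval-instances` is kept small for a smoke test.

```bash
# Minimal smoke test for the new VQA-RAD scenario.
# NOTE: the model name is a placeholder, not something this patch adds;
# use any VLM deployment available in your HELM installation.
helm-run \
  --run-entries vqa_rad:model=openai/gpt-4o-2024-05-13 \
  --suite my-suite \
  --max-eval-instances 10
```

On a cloud machine whose environment directory is network mounted, adding `--local-path /tmp/helm_env` (any local path works; the path here is just an example) sidesteps the SQLite `database is locked` error documented in the third patch.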