Adding new file changes

stanford-crfm · Dec 6, 2024 · 616b30e · 616b30e
1 parent c9b2082
commit 616b30e
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 52 deletions.
diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py
@@ -179,8 +179,7 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat
         derived_stats: List[Stat] = []
         derived_stats.extend(compute_calibration_metrics(per_instance_stats))
         return derived_stats
-
-
+
 class BasicReferenceMetric(ReferenceMetric):
     """
     Defines basic metrics for Scenarios that use one Request per Reference instead of

diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -163,6 +163,16 @@ def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few
                 '"The correct answer is (insert answer here)".'
             ),
         )
+        return RunSpec(
+            name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
+            scenario_spec=scenario_spec,
+            adapter_spec=adapter_spec,
+            metric_specs=get_exact_match_metric_specs()
+            + [
+            MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
+            ],
+            groups=["mmlu_pro"],
+        )
     else:
         adapter_spec = AdapterSpec(
             method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -174,17 +184,16 @@ def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few
             reference_prefix="(A) ",
             global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
         )
-
-    return RunSpec(
-        name=f"gpqa:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs()
-        + [
+        return RunSpec(
+            name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
+            scenario_spec=scenario_spec,
+            adapter_spec=adapter_spec,
+            metric_specs=get_exact_match_metric_specs()
+            + [
             MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
-        ],
-        groups=["mmlu_pro"],
-    )
+            ],
+            groups=["mmlu_pro"],
+        )
 
 
 @run_spec_function("gsm")
@@ -373,8 +382,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot
     )
     max_train_instance_num = 5 if use_few_shot_bool else 0
 
-    if use_few_shot_bool:
-        if use_chain_of_thought_bool:
+    if use_chain_of_thought_bool:
             adapter_spec = get_multiple_choice_adapter_spec(
                 method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
                 max_tokens=1000,  # following original repo
@@ -392,7 +400,17 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot
                 output_noun="",  # will be overwritten with output_prefix
                 output_prefix="",
             )
-        else:
+            return RunSpec(
+                name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
+                scenario_spec=scenario_spec,
+                adapter_spec=adapter_spec,
+                metric_specs=get_exact_match_metric_specs()
+                + [
+                    MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
+                ],
+                groups=["gpqa"],
+            )
+    else:
             adapter_spec = get_multiple_choice_adapter_spec(
                 method=ADAPT_MULTIPLE_CHOICE_JOINT,
                 max_train_instances=max_train_instance_num,
@@ -407,44 +425,14 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot
                 output_noun="",  # will be overwritten with output_prefix
                 output_prefix="The correct answer is ",
             )
-    else:
-        if use_chain_of_thought_bool:
-            adapter_spec = AdapterSpec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
-                max_train_instances=max_train_instance_num,
-                max_tokens=1000,
-                input_prefix="What is the correct answer to this question: ",
-                input_suffix="\nChoices:\n",
-                output_prefix="",
-                reference_prefix="(A) ",
-                global_suffix=(
-                    "Let’s think step by step. Based on your reasoning, what is the single, "
-                    "most likely answer choice? Format your response as follows: "
-                    '"The correct answer is (insert answer here)".'
-                ),
-            )
-        else:
-            adapter_spec = AdapterSpec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT,
-                max_train_instances=max_train_instance_num,
-                max_tokens=1000,
-                input_prefix="What is the correct answer to this question: ",
-                input_suffix="\nChoices:\n",
-                output_prefix="",
-                reference_prefix="(A) ",
-                global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
-            )
 
-    return RunSpec(
-        name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs()
-        + [
-            MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
-        ],
-        groups=["gpqa"],
-    )
+            return RunSpec(
+                name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
+                scenario_spec=scenario_spec,
+                adapter_spec=adapter_spec,
+                metric_specs=get_exact_match_metric_specs(),
+                groups=["gpqa"],
+            )
 
 
 @run_spec_function("ifeval")

diff --git a/src/helm/benchmark/scenarios/mmlu_scenario_pro.py b/src/helm/benchmark/scenarios/mmlu_scenario_pro.py
@@ -0,0 +1,93 @@
+from typing import Dict, List
+from datasets import Dataset, load_dataset
+
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MMLUProScenario(Scenario):
+    """
+    The MMLU-Pro dataset is an advanced version of the Massive Multitask Language Understanding (MMLU)
+    benchmark, created to push the boundaries of language models' reasoning and comprehension skills.
+    Designed as a more challenging evaluation, it increases the answer options per question from four
+    to ten, significantly reducing the likelihood of correct random guesses. This update makes the
+    dataset better at distinguishing the capabilities of models on complex tasks.
+
+    MMLU-Pro emphasizes reasoning over simple factual recall by integrating diverse, intricate questions
+    across 14 domains, including subjects like biology, economics, law, and psychology. In addition, it
+    addresses limitations in the original MMLU by filtering out trivial questions, making it a more
+    robust benchmark. Performance comparisons suggest that models benefit from reasoning-based
+    approaches (such as Chain of Thought, or CoT) on MMLU-Pro, which contrasts with the original
+    MMLU where CoT didn’t show as much benefit. This makes MMLU-Pro especially suitable for evaluating
+    advanced models that rely on nuanced reasoning and comprehension skills.
+
+    Dataset: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro
+    Paper: https://arxiv.org/abs/2406.01574
+    """
+
+    name = "mmlu_pro"
+    description = "Enhanced Massive Multitask Language Understanding with increased options and reasoning"
+    tags = ["knowledge", "multiple_choice", "reasoning"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject
+
+    def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
+        """
+        Convert the rows of one Hugging Face split into instances.
+
+        The correct reference is identified by its position (the letter in `row["answer"]`),
+        not by its text, so duplicated option texts never yield several correct references.
+
+        :param data: Hugging Face `Dataset` containing the data for a specific split.
+        :param split: The split assigned to every created instance (e.g., "train", "test").
+        :return: A list of processed `Instance` objects, one per row.
+        :raises KeyError: If the answer letter does not correspond to any option of the row.
+        """
+        choice_letters = "ABCDEFGHIJ"  # same ten labels the options are keyed by, in order
+        hlog(f"Processing data for {split} split")
+        instances: List[Instance] = []
+        for row in data:
+            question = row["question"]
+            answers = row["options"]
+            # zip() stops at the shorter input, so only letters that have an option are valid.
+            index_by_letter = dict(zip(choice_letters, range(len(answers))))
+            correct_index = index_by_letter[row["answer"]]
+            references = [
+                Reference(Output(text=answer), tags=[CORRECT_TAG] if index == correct_index else [])
+                for index, answer in enumerate(answers)
+            ]
+            instances.append(Instance(input=Input(text=question), references=references, split=split))
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Load and process the MMLU-Pro dataset to create instances.
+
+        :param output_path: Not used by this method; the dataset is obtained via `load_dataset`.
+        :return: A list of all processed `Instance` objects for `self.subject`.
+        """
+        # Load the MMLU-Pro dataset from Hugging Face at a fixed revision
+        dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b")
+
+        # Hugging Face split name -> split assigned to the resulting instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "validation": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for hf_split, split in splits.items():
+            data = dataset[hf_split].filter(lambda x: x["category"] == self.subject)
+            instances.extend(self.process_dataset(data, split))
+
+        return instances