Changes for MMLU PRO with COT #3200

Merged
merged 30 commits on Dec 6, 2024
Changes from 28 commits
Commits (30)
fad62fd
Committing changes for COT metric
siyagoel Nov 11, 2024
89460ec
Changes for COT metric
siyagoel Nov 11, 2024
a366e24
Changes to COT metric
siyagoel Nov 11, 2024
d676183
Changes to COT Metric
siyagoel Nov 14, 2024
de6b9b1
Changes made to file.
siyagoel Nov 14, 2024
6c09cbc
Changes made
siyagoel Nov 14, 2024
2e02fb7
Committing changes
siyagoel Nov 14, 2024
d039a9d
Changes committed
siyagoel Nov 14, 2024
af03185
Correct changes to metric
siyagoel Nov 14, 2024
d675da0
format changes
siyagoel Nov 14, 2024
16afbbe
changes
siyagoel Nov 14, 2024
4a8e167
Merge branch 'main' into siyagoel/cotmetric
siyagoel Nov 14, 2024
d367578
changes to file
siyagoel Nov 14, 2024
23968c2
changed format
siyagoel Nov 14, 2024
90ac194
changes to file by deleting
siyagoel Nov 14, 2024
7cfbb1c
reformat file
siyagoel Nov 14, 2024
c876828
changes in files for schema_lite_z2.yaml
siyagoel Nov 15, 2024
97a9aff
Changes to address comments
siyagoel Nov 15, 2024
6d5eb55
changes added based on comments
siyagoel Nov 15, 2024
1398ab2
MMLU Pro With Metric
siyagoel Nov 15, 2024
243057e
format changes to files
siyagoel Nov 15, 2024
0ce9bc9
Changes to most recent comments
siyagoel Dec 5, 2024
bd7edc1
changes for format
siyagoel Dec 5, 2024
c9b2082
changed the correctness metric
siyagoel Dec 5, 2024
616b30e
Adding new file changes
siyagoel Dec 6, 2024
d71d92d
Adding a new file
siyagoel Dec 6, 2024
33a65de
Changes to formatting
siyagoel Dec 6, 2024
4388c8e
Please enter the commit message for your changes. Lines starting
siyagoel Dec 6, 2024
ed10470
Changed schema file
siyagoel Dec 6, 2024
b09dd91
Changed test scenario to match changes
siyagoel Dec 6, 2024
93 changes: 93 additions & 0 deletions src/helm/benchmark/metrics/chain_of_thought_metric.py
@@ -0,0 +1,93 @@
import re
from typing import List, Optional

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


def extract_answer(output_text: str) -> Optional[str]:
"""
Extracts the answer from the output text using two exact regex patterns.
Returns None if no valid answer is found.

Args:
output_text (str): The text from which to extract the answer.

Returns:
Optional[str]: The extracted answer (A-J) if found, otherwise None.
"""
# First regex: Matches "answer is (A-J)" with optional parentheses
match = re.search(r"answer is \(?([A-J])\)?", output_text)
if match:
return match.group(1)

# Second regex: Matches "Answer: (A-J)" or "answer: (A-J)", with optional leading characters like "."
match = re.search(r"\.*[aA]nswer:\s*\(?([A-J])\)?", output_text)
if match:
return match.group(1)

# If neither regex matches, return None
return None


class ChainOfThoughtMetric(Metric):
"""
This metric focuses on structured reasoning and the accuracy of extracted answers.
It compares model outputs against correct answers provided in a multiple-choice
format and returns a score indicating the correctness of the generated response.
"""

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""
Evaluate the generated output for chain-of-thought reasoning accuracy.

The method extracts the model's output, determines the correct answer
from the provided references, and compares the two to compute a binary score.

Args:
adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation.
request_state (RequestState): The state of the current request, including
the input instance, output results, and references.
metric_service (MetricService): A service used to compute metrics if needed.
eval_cache_path (str): Path to the evaluation cache for storing or retrieving data.

Returns:
List[Stat]: A list containing a single `Stat` object with the correctness
score (1 for correct, 0 for incorrect) under the metric
name "chain_of_thought_correct".
"""
# Assert that completions exist if the result is not None
assert (
request_state.result is not None and request_state.result.completions
), "Request state result must have completions."

# Set output_text if the assertion passes
output_text = request_state.result.completions[0].text

# Extract the answer using the updated logic
extracted_answer = extract_answer(output_text)

# Find the correct answer from references by translating index to letter
correct_answer = None
for index, option in enumerate(request_state.instance.references):
if option.is_correct:
correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.)
break

# Raise an exception if no correct answer is found
if correct_answer is None:
raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}")

# Compare extracted answer with the correct answer and compute the score
score = 1 if extracted_answer == correct_answer else 0
return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
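For context, a minimal usage sketch of the new metric follows (not part of the diff; the completion strings and the correct_answer value are invented for illustration). It shows which outputs the two extraction patterns accept and how the binary score is derived.

# Illustrative sketch only; the sample completions are made up, not taken from the PR.
from helm.benchmark.metrics.chain_of_thought_metric import extract_answer

completion = "Let's think step by step. ... The correct answer is (B)."
assert extract_answer(completion) == "B"  # caught by the "answer is (A-J)" pattern
assert extract_answer("Answer: C") == "C"  # caught by the fallback "answer:" pattern
assert extract_answer("I cannot decide.") is None  # neither pattern matches

# The metric then compares the extracted letter against the correct reference letter:
correct_answer = "B"  # hypothetical reference letter for this example
score = 1 if extract_answer(completion) == correct_answer else 0
assert score == 1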
188 changes: 100 additions & 88 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -21,11 +21,11 @@
get_generative_harms_metric_specs,
get_generic_metric_specs,
get_open_ended_generation_metric_specs,
MetricSpec,
)
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.runner import get_benchmark_output_path
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
from helm.benchmark.metrics.metric import MetricSpec


@run_spec_function("narrative_qa")
@@ -137,25 +137,59 @@ def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:


@run_spec_function("mmlu_pro")
def get_mmlu_pro_spec(subject: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.mmlu_pro.MMLUProScenario", args={"subject": subject}
)
def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
# Convert to bools and remove the str versions
use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
use_few_shot_bool: bool = use_few_shot == "True"
del use_chain_of_thought
del use_few_shot

adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.",
input_noun="Question",
output_noun="Answer",
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject}
)
max_train_instance_num = 5 if use_few_shot_bool else 0

return RunSpec(
name=f"mmlu_pro:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["mmlu_pro"],
)
if use_chain_of_thought_bool:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=(
"Let’s think step by step. Based on your reasoning, what is the single, "
"most likely answer choice? Format your response as follows: "
'"The correct answer is (insert answer here)".'
),
)
return RunSpec(
name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs()
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["mmlu_pro"],
)
else:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
)
return RunSpec(
name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["mmlu_pro"],
)


@run_spec_function("gsm")
@@ -344,79 +378,57 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
)
max_train_instance_num = 5 if use_few_shot_bool else 0

if use_few_shot_bool:
if use_chain_of_thought_bool:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_tokens=1000, # following original repo
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
global_suffix=(
"Give step by step reasoning before you answer, and when you’re ready to answer, "
'please use the format "The correct answer is (insert answer here)":'
),
)
else:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)
if use_chain_of_thought_bool:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_tokens=1000, # following original repo
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
)
return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs()
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["gpqa"],
)
else:
if use_chain_of_thought_bool:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
reference_prefix="(A) ",
global_suffix=(
"Let’s think step by step. Based on your reasoning, what is the single, "
"most likely answer choice? Format your response as follows: "
'"The correct answer is (insert answer here)".'
),
)
else:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
reference_prefix="(A) ",
global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
)
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)

return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready
groups=["gpqa"],
)
return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["gpqa"],
)


@run_spec_function("ifeval")
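As a usage sketch for the updated mmlu_pro run-spec entry point (not part of the diff; the subject value and the printed expectations are illustrative assumptions), note that both new flags arrive as strings and only the exact value "True" enables them:

# Minimal sketch; "physics" is just an example MMLU-Pro category.
from helm.benchmark.run_specs.lite_run_specs import get_mmlu_pro_spec

run_spec = get_mmlu_pro_spec(subject="physics", use_chain_of_thought="True", use_few_shot="False")

print(run_spec.name)  # expected: mmlu_pro:subset=physics,use_chain_of_thought=True
print(run_spec.adapter_spec.max_train_instances)  # 0, since few-shot is disabled
print([m.class_name for m in run_spec.metric_specs])  # should include ...chain_of_thought_metric.ChainOfThoughtMetric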
src/helm/benchmark/scenarios/mmlu_pro_scenario.py
@@ -1,8 +1,17 @@
from typing import Dict, List
from datasets import load_dataset
from datasets import Dataset, load_dataset

from helm.common.hierarchical_logger import hlog
from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TRAIN_SPLIT,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)


class MMLUProScenario(Scenario):
@@ -33,7 +42,14 @@ def __init__(self, subject: str):
super().__init__()
self.subject: str = subject

def process_csv(self, data, split: str) -> List[Instance]:
def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
"""
Process the dataset to create instances.

:param data: Hugging Face `Dataset` containing the data for a specific split.
:param split: The data split (e.g., "train", "test").
:return: A list of processed `Instance` objects.
"""
instances: List[Instance] = []
hlog(f"Processing data for {split} split")
for row in data:
@@ -55,8 +71,14 @@ def answer_to_reference(answer: str) -> Reference:
return instances

def get_instances(self, output_path: str) -> List[Instance]:
"""
Load and process the MMLU-Pro dataset to create instances.

:param output_path: Path to save or output the processed instances.
:return: A list of all processed `Instance` objects.
"""
# Load the MMLU-Pro dataset from Hugging Face
dataset = load_dataset("TIGER-Lab/MMLU-Pro")
dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b")

# Process all the instances
instances: List[Instance] = []
@@ -66,6 +88,6 @@ def get_instances(self, output_path: str) -> List[Instance]:
}
for hf_split, split in splits.items():
data = dataset[hf_split].filter(lambda x: x["category"] == self.subject)
instances.extend(self.process_csv(data, split))
instances.extend(self.process_dataset(data, split))

return instances
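To exercise the updated scenario directly, here is a minimal sketch (not part of the diff; the category and output path are arbitrary examples):

# Minimal sketch; "engineering" is just one MMLU-Pro category, and the output path is arbitrary.
from helm.benchmark.scenarios.mmlu_pro_scenario import MMLUProScenario
from helm.benchmark.scenarios.scenario import TEST_SPLIT

scenario = MMLUProScenario(subject="engineering")
instances = scenario.get_instances(output_path="/tmp/mmlu_pro")  # pulls TIGER-Lab/MMLU-Pro at the pinned revision

test_instances = [inst for inst in instances if inst.split == TEST_SPLIT]
print(len(test_instances))
print(test_instances[0].input.text)  # the question text
print([ref.output.text for ref in test_instances[0].references if ref.is_correct])  # the gold option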