From fad62fd07facb785e0300cea841ca332e6b6ed8e Mon Sep 17 00:00:00 2001 From: siyagoel Date: Mon, 11 Nov 2024 15:17:00 -0800 Subject: [PATCH 01/18] Committing changes for COT metric --- .../metrics/air_bench_metrics copy.py | 56 ++++++++++++ src/helm/benchmark/metrics/basic_metrics.py | 2 +- .../metrics/chain_of_thought_metric.py | 61 +++++++++++++ .../benchmark/run_specs/lite_run_specs.py | 90 ++++++++++++++++++- 4 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 src/helm/benchmark/metrics/air_bench_metrics copy.py create mode 100644 src/helm/benchmark/metrics/chain_of_thought_metric.py diff --git a/src/helm/benchmark/metrics/air_bench_metrics copy.py b/src/helm/benchmark/metrics/air_bench_metrics copy.py new file mode 100644 index 0000000000..97401b9978 --- /dev/null +++ b/src/helm/benchmark/metrics/air_bench_metrics copy.py @@ -0,0 +1,56 @@ +from typing import List + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics +from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + + +class AIRBench2024BasicGenerationMetric(Metric): + """Replacement for BasicGenerationMetric for AIRBench 2024. + + We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` + because we abuse "references" to store metadata rather than true metadata.""" + + def __init__(self): + super().__init__() + self.efficiency_metric = EfficiencyMetric() + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service) + + +class AIRBench2024ScoreMetric(Metric): + """Score metrics for AIRBench 2024.""" + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + assert len(request_state.instance.references) > 1 + category_text = request_state.instance.references[0].output.text + category_parts = category_text.split(".") + assert len(category_parts) == 3 + assert request_state.annotations + score = request_state.annotations["air_bench_2024"]["score"] + return [ + Stat(MetricName("air_score")).add(score), + Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score), + Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score), + Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add( + score + ), + ] diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 48c933f076..0e11ac24dd 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -181,7 +181,7 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat return derived_stats -class BasicReferenceMetric(ReferenceMetric): +class BasicGenerationMetric(ReferenceMetric): """ Defines basic metrics for Scenarios that use one Request per Reference instead of one per Instance. 
diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py new file mode 100644 index 0000000000..6e6830d9c7 --- /dev/null +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -0,0 +1,61 @@ +from typing import List + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics +from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + +import re +import random +from typing import List + + + +class ChainOfThoughtMetric(Metric): + """Replacement for BasicGenerationMetric for AIRBench 2024. + + We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` + because we abuse "references" to store metadata rather than true metadata.""" + + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, +) -> List[Stat]: + # Output from the model + output_text = request_state.result.completions[0].text + + # Initial regex pattern to match answer + match = re.search(r'answer is \(?([A-J])\)?', output_text) + + # Secondary regex pattern if the initial one fails + if not match: + match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) + + # Fallback mechanism + if match: + extracted_answer = match.group(1) + else: + extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) + + # Find the correct answer from references + correct_answer = None + for option in request_state.instance.references: + if option.get("is_correct"): + correct_answer = option.get("label") # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. 
+ break + + # Return the score in the specified format + score = 1 if extracted_answer == correct_answer else 0 + return [ + Stat(MetricName("chain_of_thought_correct")).add(score) + ] + + diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 99b111c804..49b905e9f5 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -25,6 +25,7 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path +from helm.benchmark.metrics.metric import MetricSpec; @run_spec_function("narrative_qa") @@ -413,6 +414,93 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready + metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready + groups=["gpqa"], + ) + +@run_spec_function("gpqa") +def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: + # Convert to bools and remove the str versions + use_chain_of_thought_bool: bool = use_chain_of_thought == "True" + use_few_shot_bool: bool = use_few_shot == "True" + del use_chain_of_thought + del use_few_shot + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset} + ) + max_train_instance_num = 5 if use_few_shot_bool else 0 + + if use_few_shot_bool: + if use_chain_of_thought_bool: + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_tokens=1000, # following original repo + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." + ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + chain_of_thought_prefix="Let's think step by step: ", + chain_of_thought_suffix="The correct answer is ", + output_noun="", # will be overwritten with output_prefix + output_prefix="", + global_suffix=( + "Give step by step reasoning before you answer, and when you’re ready to answer, " + 'please use the format "The correct answer is (insert answer here)":' + ), + ) + else: + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." 
+ ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + output_noun="", # will be overwritten with output_prefix + output_prefix="The correct answer is ", + ) + else: + if use_chain_of_thought_bool: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + reference_prefix="(A) ", + global_suffix=( + "Let’s think step by step. Based on your reasoning, what is the single, " + "most likely answer choice? Format your response as follows: " + '"The correct answer is (insert answer here)".' + ), + ) + else: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + reference_prefix="(A) ", + global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), + ) + + return RunSpec( + name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready groups=["gpqa"], ) From 89460ec06d0f847cb731a045b7d3eecd6c1671a1 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Mon, 11 Nov 2024 15:53:39 -0800 Subject: [PATCH 02/18] Changes for COT metrix --- src/helm/benchmark/metrics/basic_metrics.py | 5 +- .../metrics/chain_of_thought_metric.py | 72 ++++++++++--------- .../benchmark/run_specs/lite_run_specs.py | 5 +- src/helm/benchmark/static/schema_lite_v2.yaml | 8 ++- 4 files changed, 48 insertions(+), 42 deletions(-) diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 0e11ac24dd..6031be068e 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -179,9 +179,8 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat derived_stats: List[Stat] = [] derived_stats.extend(compute_calibration_metrics(per_instance_stats)) return derived_stats - - -class BasicGenerationMetric(ReferenceMetric): + +class BasicReferenceMetric(ReferenceMetric): """ Defines basic metrics for Scenarios that use one Request per Reference instead of one per Instance. 
diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 6e6830d9c7..fe8c5c982f 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -22,40 +22,42 @@ class ChainOfThoughtMetric(Metric): because we abuse "references" to store metadata rather than true metadata.""" - def evaluate_generation( - self, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, - eval_cache_path: str, -) -> List[Stat]: - # Output from the model - output_text = request_state.result.completions[0].text - - # Initial regex pattern to match answer - match = re.search(r'answer is \(?([A-J])\)?', output_text) - - # Secondary regex pattern if the initial one fails - if not match: - match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) - - # Fallback mechanism - if match: - extracted_answer = match.group(1) - else: - extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) - - # Find the correct answer from references - correct_answer = None - for option in request_state.instance.references: - if option.get("is_correct"): - correct_answer = option.get("label") # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. - break - - # Return the score in the specified format - score = 1 if extracted_answer == correct_answer else 0 - return [ - Stat(MetricName("chain_of_thought_correct")).add(score) - ] + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + # Output from the model + output_text = request_state.result.completions[0].text + + # Initial regex pattern to match answer + match = re.search(r'answer is \(?([A-J])\)?', output_text) + + # Secondary regex pattern if the initial one fails + if not match: + match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) + + # Fallback mechanism + if match: + extracted_answer = match.group(1) + else: + extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) + + # Find the correct answer from references + correct_answer = None + + #option is an object with attributes + for option in request_state.instance.references: + if option.is_correct: + correct_answer = option # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. 
+ break + + # Return the score in the specified format + score = 1 if extracted_answer == correct_answer else 0 + return [ + Stat(MetricName("chain_of_thought_correct"), score) + ] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 49b905e9f5..96ed4890cc 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -330,8 +330,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) - -@run_spec_function("gpqa") +""" @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions use_chain_of_thought_bool: bool = use_chain_of_thought == "True" @@ -416,7 +415,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready groups=["gpqa"], - ) + )""" @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 3f55d69cd2..d5f06655f9 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -88,6 +88,12 @@ metrics: short_display_name: PEM description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing. lower_is_better: false + - name: chain_of_thought_correct + # TODO: should call this prefix_quasi_exact_match + display_name: COT correct + short_display_name: COT correct + description: TBD. + lower_is_better: false ############################################################ perturbations: [] @@ -156,7 +162,7 @@ run_groups: - efficiency - general_information environment: - main_name: exact_match # non-CoT + main_name: chain_of_thought_correct # non-CoT main_split: test taxonomy: task: "?" 
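For reference, a minimal sketch (not part of the patch) of the Stat accumulation pattern used by the AIRBench metric in the first commit, where the score is fed in via .add(...) rather than through the Stat constructor; a later commit in this series adopts the same form for the chain-of-thought metric. Module paths follow the imports already used above.

from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.statistic import Stat

# Build a named stat and accumulate one observation; .add() returns the Stat,
# so the expression can sit directly in the list returned by evaluate_generation.
score = 1
stat = Stat(MetricName("chain_of_thought_correct")).add(score)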
From a366e24a1451762579331f0d826c0b051f3e04ad Mon Sep 17 00:00:00 2001 From: siyagoel Date: Mon, 11 Nov 2024 15:54:25 -0800 Subject: [PATCH 03/18] Changes to COT metric --- src/helm/benchmark/metrics/basic_metrics.py | 3 ++- .../metrics/chain_of_thought_metric.py | 24 +++++++------------ .../benchmark/run_specs/lite_run_specs.py | 9 +++++-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 6031be068e..48c933f076 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -179,7 +179,8 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat derived_stats: List[Stat] = [] derived_stats.extend(compute_calibration_metrics(per_instance_stats)) return derived_stats - + + class BasicReferenceMetric(ReferenceMetric): """ Defines basic metrics for Scenarios that use one Request per Reference instead of diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index fe8c5c982f..aa9a6ebd6f 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -14,14 +14,12 @@ from typing import List - class ChainOfThoughtMetric(Metric): """Replacement for BasicGenerationMetric for AIRBench 2024. We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` because we abuse "references" to store metadata rather than true metadata.""" - def evaluate_generation( self, adapter_spec: AdapterSpec, @@ -31,24 +29,24 @@ def evaluate_generation( ) -> List[Stat]: # Output from the model output_text = request_state.result.completions[0].text - + # Initial regex pattern to match answer - match = re.search(r'answer is \(?([A-J])\)?', output_text) - + match = re.search(r"answer is \(?([A-J])\)?", output_text) + # Secondary regex pattern if the initial one fails if not match: - match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) - + match = re.search(r"\.\s*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + # Fallback mechanism if match: extracted_answer = match.group(1) else: - extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) - + extracted_answer = random.choice(["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]) + # Find the correct answer from references correct_answer = None - #option is an object with attributes + # option is an object with attributes for option in request_state.instance.references: if option.is_correct: correct_answer = option # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. 
@@ -56,8 +54,4 @@ def evaluate_generation( # Return the score in the specified format score = 1 if extracted_answer == correct_answer else 0 - return [ - Stat(MetricName("chain_of_thought_correct"), score) - ] - - + return [Stat(MetricName("chain_of_thought_correct"), score)] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 96ed4890cc..18611a8850 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -25,7 +25,7 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path -from helm.benchmark.metrics.metric import MetricSpec; +from helm.benchmark.metrics.metric import MetricSpec @run_spec_function("narrative_qa") @@ -330,6 +330,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) + """ @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions @@ -417,6 +418,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot groups=["gpqa"], )""" + @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions @@ -500,6 +502,9 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready + metric_specs=get_exact_match_metric_specs() + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], # TODO: update this after cot metric is ready groups=["gpqa"], ) From d676183888c00fc8dceb672c52eeafef1752fb35 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:09:47 -0800 Subject: [PATCH 04/18] Changes to COT Metric --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index aa9a6ebd6f..5527d3a20b 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -28,7 +28,11 @@ def evaluate_generation( eval_cache_path: str, ) -> List[Stat]: # Output from the model - output_text = request_state.result.completions[0].text + if request_state.result is not None and request_state.result.completions: + output_text = request_state.result.completions[0].text + else: + raise ValueError("Request result is None or completions is empty") + # Initial regex pattern to match answer match = re.search(r"answer is \(?([A-J])\)?", output_text) From de6b9b1781691bbcd1a1bbb1b382ce65d1e8d2a7 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:11:32 -0800 Subject: [PATCH 05/18] Changes made to file. 
--- src/helm/benchmark/metrics/chain_of_thought_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 5527d3a20b..ddaa692ef2 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -33,7 +33,6 @@ def evaluate_generation( else: raise ValueError("Request result is None or completions is empty") - # Initial regex pattern to match answer match = re.search(r"answer is \(?([A-J])\)?", output_text) From 6c09cbc886424649837031302f93d30ebd0c2e23 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:14:55 -0800 Subject: [PATCH 06/18] Changes made --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index ddaa692ef2..a710d49252 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -2,8 +2,6 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics -from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName from helm.benchmark.metrics.metric_service import MetricService @@ -11,7 +9,6 @@ import re import random -from typing import List class ChainOfThoughtMetric(Metric): From 2e02fb7686a2e06f085e25516f6c67a9e3d54c10 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:17:13 -0800 Subject: [PATCH 07/18] Committing changes --- .../benchmark/run_specs/lite_run_specs.py | 89 ------------------- 1 file changed, 89 deletions(-) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 18611a8850..d2b2b148d1 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -330,95 +330,6 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) - -""" @run_spec_function("gpqa") -def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: - # Convert to bools and remove the str versions - use_chain_of_thought_bool: bool = use_chain_of_thought == "True" - use_few_shot_bool: bool = use_few_shot == "True" - del use_chain_of_thought - del use_few_shot - - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset} - ) - max_train_instance_num = 5 if use_few_shot_bool else 0 - - if use_few_shot_bool: - if use_chain_of_thought_bool: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_tokens=1000, # following original repo - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." 
- ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - chain_of_thought_prefix="Let's think step by step: ", - chain_of_thought_suffix="The correct answer is ", - output_noun="", # will be overwritten with output_prefix - output_prefix="", - global_suffix=( - "Give step by step reasoning before you answer, and when you’re ready to answer, " - 'please use the format "The correct answer is (insert answer here)":' - ), - ) - else: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." - ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - output_noun="", # will be overwritten with output_prefix - output_prefix="The correct answer is ", - ) - else: - if use_chain_of_thought_bool: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=( - "Let’s think step by step. Based on your reasoning, what is the single, " - "most likely answer choice? Format your response as follows: " - '"The correct answer is (insert answer here)".' - ), - ) - else: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), - ) - - return RunSpec( - name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready - groups=["gpqa"], - )""" - - @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions From d039a9d354d0c6657f4943de2e2f1116916f9fe5 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:17:55 -0800 Subject: [PATCH 08/18] Changes committed --- src/helm/benchmark/run_specs/lite_run_specs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index d2b2b148d1..359ce157f2 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -330,6 +330,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) + @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions From af0318521ccb85a094ea369925770971b77b6c0f Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:29:23 -0800 Subject: [PATCH 09/18] orrect changes to metric --- .../metrics/chain_of_thought_metric.py | 53 
++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index a710d49252..1ebbd15e26 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -11,11 +11,26 @@ import random -class ChainOfThoughtMetric(Metric): - """Replacement for BasicGenerationMetric for AIRBench 2024. +def extract_answer(output_text: str) -> str: + """ + Extracts the answer from the output text using two exact regex patterns. + Returns "N/A" if no valid answer is found. + """ + # First regex: Matches "answer is (A-J)" with optional parentheses + match = re.search(r"answer is \(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # Second regex: Matches "[answer: (A-J)]" with optional leading characters like "." + match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # If neither regex matches, return "N/A" + return "N/A" - We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` - because we abuse "references" to store metadata rather than true metadata.""" +class ChainOfThoughtMetric(Metric): + """Replacement for BasicGenerationMetric for AIRBench 2024.""" def evaluate_generation( self, @@ -27,31 +42,17 @@ def evaluate_generation( # Output from the model if request_state.result is not None and request_state.result.completions: output_text = request_state.result.completions[0].text - else: - raise ValueError("Request result is None or completions is empty") - - # Initial regex pattern to match answer - match = re.search(r"answer is \(?([A-J])\)?", output_text) - - # Secondary regex pattern if the initial one fails - if not match: - match = re.search(r"\.\s*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + + # Extract the answer using the updated logic + extracted_answer = extract_answer(output_text) - # Fallback mechanism - if match: - extracted_answer = match.group(1) - else: - extracted_answer = random.choice(["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]) - - # Find the correct answer from references + # Find the correct answer from references by translating index to letter correct_answer = None - - # option is an object with attributes - for option in request_state.instance.references: + for index, option in enumerate(request_state.instance.references): if option.is_correct: - correct_answer = option # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. + correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) 
break - # Return the score in the specified format score = 1 if extracted_answer == correct_answer else 0 - return [Stat(MetricName("chain_of_thought_correct"), score)] + # Return the score in the specified format + return [Stat(MetricName("chain_of_thought_correct"), score)] \ No newline at end of file From d675da0ae0e8fd737baec0a066dffe0d94a807ea Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:29:55 -0800 Subject: [PATCH 10/18] format changes --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 1ebbd15e26..e1c6f53769 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -29,6 +29,7 @@ def extract_answer(output_text: str) -> str: # If neither regex matches, return "N/A" return "N/A" + class ChainOfThoughtMetric(Metric): """Replacement for BasicGenerationMetric for AIRBench 2024.""" @@ -42,7 +43,7 @@ def evaluate_generation( # Output from the model if request_state.result is not None and request_state.result.completions: output_text = request_state.result.completions[0].text - + # Extract the answer using the updated logic extracted_answer = extract_answer(output_text) @@ -55,4 +56,4 @@ def evaluate_generation( score = 1 if extracted_answer == correct_answer else 0 # Return the score in the specified format - return [Stat(MetricName("chain_of_thought_correct"), score)] \ No newline at end of file + return [Stat(MetricName("chain_of_thought_correct"), score)] From 16afbbeb2bf48abb637c8ead968522dfbd356336 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:31:01 -0800 Subject: [PATCH 11/18] changes --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index e1c6f53769..a97bf95e5e 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -8,7 +8,6 @@ from helm.benchmark.metrics.statistic import Stat import re -import random def extract_answer(output_text: str) -> str: From d367578959f69d950982df4127d4ba30febca95e Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:55:44 -0800 Subject: [PATCH 12/18] changes to file --- .../metrics/air_bench_metrics copy.py | 56 ------------------- .../metrics/chain_of_thought_metric.py | 2 + 2 files changed, 2 insertions(+), 56 deletions(-) delete mode 100644 src/helm/benchmark/metrics/air_bench_metrics copy.py diff --git a/src/helm/benchmark/metrics/air_bench_metrics copy.py b/src/helm/benchmark/metrics/air_bench_metrics copy.py deleted file mode 100644 index 97401b9978..0000000000 --- a/src/helm/benchmark/metrics/air_bench_metrics copy.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import List - -from helm.benchmark.adaptation.adapter_spec import AdapterSpec -from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics -from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric -from helm.benchmark.metrics.metric import Metric -from helm.benchmark.metrics.metric_name import MetricName -from helm.benchmark.metrics.metric_service import MetricService -from helm.benchmark.metrics.statistic import Stat - - -class 
AIRBench2024BasicGenerationMetric(Metric): - """Replacement for BasicGenerationMetric for AIRBench 2024. - - We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` - because we abuse "references" to store metadata rather than true metadata.""" - - def __init__(self): - super().__init__() - self.efficiency_metric = EfficiencyMetric() - - def evaluate_generation( - self, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, - eval_cache_path: str, - ) -> List[Stat]: - return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service) - - -class AIRBench2024ScoreMetric(Metric): - """Score metrics for AIRBench 2024.""" - - def evaluate_generation( - self, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, - eval_cache_path: str, - ) -> List[Stat]: - assert len(request_state.instance.references) > 1 - category_text = request_state.instance.references[0].output.text - category_parts = category_text.split(".") - assert len(category_parts) == 3 - assert request_state.annotations - score = request_state.annotations["air_bench_2024"]["score"] - return [ - Stat(MetricName("air_score")).add(score), - Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score), - Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score), - Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add( - score - ), - ] diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index a97bf95e5e..fd28dc47b9 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -52,6 +52,8 @@ def evaluate_generation( if option.is_correct: correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) break + + print(request_state.instance.id, correct_answer, extracted_answer) score = 1 if extracted_answer == correct_answer else 0 # Return the score in the specified format From 23968c24612e8d3345be0b2cd6accf6f319602f1 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:56:42 -0800 Subject: [PATCH 13/18] changed format --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index fd28dc47b9..c7d8560cba 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -52,7 +52,7 @@ def evaluate_generation( if option.is_correct: correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) 
break - + print(request_state.instance.id, correct_answer, extracted_answer) score = 1 if extracted_answer == correct_answer else 0 From 90ac1947faca4d7b1bad81d5aae1bd7f372f20bf Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:59:06 -0800 Subject: [PATCH 14/18] changes to file by deleting --- src/helm/benchmark/run_specs/lite_run_specs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 552b80267f..bac947e3ca 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -20,8 +20,7 @@ get_f1_metric_specs, get_generative_harms_metric_specs, get_generic_metric_specs, - get_open_ended_generation_metric_specs, - MetricSpec, + get_open_ended_generation_metric_specs ) from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path From 7cfbb1c2db1fd1f954ebd970a7882681556b8ed0 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:59:48 -0800 Subject: [PATCH 15/18] reformat file --- src/helm/benchmark/run_specs/lite_run_specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index bac947e3ca..7fd40dcf91 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -20,7 +20,7 @@ get_f1_metric_specs, get_generative_harms_metric_specs, get_generic_metric_specs, - get_open_ended_generation_metric_specs + get_open_ended_generation_metric_specs, ) from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path From c87682861b9b6f6d2a05d36ec7e8cb0b7e7c38bf Mon Sep 17 00:00:00 2001 From: siyagoel Date: Fri, 15 Nov 2024 02:46:23 -0800 Subject: [PATCH 16/18] changes in files for schema_lite_z2.yaml --- .../metrics/chain_of_thought_metric.py | 2 +- src/helm/benchmark/scenarios/mmlu_pro.py | 34 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index c7d8560cba..23eb3e65a0 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -57,4 +57,4 @@ def evaluate_generation( score = 1 if extracted_answer == correct_answer else 0 # Return the score in the specified format - return [Stat(MetricName("chain_of_thought_correct"), score)] + return [Stat(MetricName("chain_of_thought_correct")).add(score)] diff --git a/src/helm/benchmark/scenarios/mmlu_pro.py b/src/helm/benchmark/scenarios/mmlu_pro.py index a091387dc2..4b50f4e9df 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro.py +++ b/src/helm/benchmark/scenarios/mmlu_pro.py @@ -1,9 +1,17 @@ from typing import Dict, List -from datasets import load_dataset +from datasets import Dataset, load_dataset from helm.common.hierarchical_logger import hlog -from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output - +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) class MMLUProScenario(Scenario): """ @@ -33,7 +41,14 @@ def __init__(self, subject: str): super().__init__() self.subject: str = subject - def process_csv(self, data, split: str) 
-> List[Instance]: + def process_dataset(self, data: Dataset, split: str) -> List[Instance]: + """ + Process the dataset to create instances. + + :param data: Hugging Face `Dataset` containing the data for a specific split. + :param split: The data split (e.g., "train", "test"). + :return: A list of processed `Instance` objects. + """ instances: List[Instance] = [] hlog(f"Processing data for {split} split") for row in data: @@ -55,8 +70,14 @@ def answer_to_reference(answer: str) -> Reference: return instances def get_instances(self, output_path: str) -> List[Instance]: + """ + Load and process the MMLU-Pro dataset to create instances. + + :param output_path: Path to save or output the processed instances. + :return: A list of all processed `Instance` objects. + """ # Load the MMLU-Pro dataset from Hugging Face - dataset = load_dataset("TIGER-Lab/MMLU-Pro") + dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b") # Process all the instances instances: List[Instance] = [] @@ -66,6 +87,7 @@ def get_instances(self, output_path: str) -> List[Instance]: } for hf_split, split in splits.items(): data = dataset[hf_split].filter(lambda x: x["category"] == self.subject) - instances.extend(self.process_csv(data, split)) + instances.extend(self.process_dataset(data, split)) return instances + From 97a9affa745ded322611892f293703bc6da8b1ef Mon Sep 17 00:00:00 2001 From: siyagoel Date: Fri, 15 Nov 2024 03:11:31 -0800 Subject: [PATCH 17/18] Changes to address comments --- .../metrics/chain_of_thought_metric.py | 59 ++++++++++++++----- .../benchmark/run_specs/lite_run_specs.py | 2 +- src/helm/benchmark/static/schema_lite_v2.yaml | 28 ++++----- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 23eb3e65a0..67efdbb942 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -1,4 +1,5 @@ -from typing import List +import re +from typing import List, Optional from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.adaptation.request_state import RequestState @@ -7,13 +8,16 @@ from helm.benchmark.metrics.metric_service import MetricService from helm.benchmark.metrics.statistic import Stat -import re - - -def extract_answer(output_text: str) -> str: +def extract_answer(output_text: str) -> Optional[str]: """ Extracts the answer from the output text using two exact regex patterns. - Returns "N/A" if no valid answer is found. + Returns None if no valid answer is found. + + Args: + output_text (str): The text from which to extract the answer. + + Returns: + Optional[str]: The extracted answer (A-J) if found, otherwise None. """ # First regex: Matches "answer is (A-J)" with optional parentheses match = re.search(r"answer is \(?([A-J])\)?", output_text) @@ -25,12 +29,16 @@ def extract_answer(output_text: str) -> str: if match: return match.group(1) - # If neither regex matches, return "N/A" - return "N/A" + # If neither regex matches, return None + return None class ChainOfThoughtMetric(Metric): - """Replacement for BasicGenerationMetric for AIRBench 2024.""" + """ + This metric focuses on structured reasoning and the accuracy of extracted answers. + It compares model outputs against correct answers provided in a multiple-choice + format and returns a score indicating the correctness of the generated response. 
+ """ def evaluate_generation( self, @@ -39,9 +47,30 @@ def evaluate_generation( metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: - # Output from the model - if request_state.result is not None and request_state.result.completions: - output_text = request_state.result.completions[0].text + """ + Evaluate the generated output for chain-of-thought reasoning accuracy. + + The method extracts the model's output, determines the correct answer + from the provided references, and compares the two to compute a binary score. + + Args: + adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation. + request_state (RequestState): The state of the current request, including + the input instance, output results, and references. + metric_service (MetricService): A service used to compute metrics if needed. + eval_cache_path (str): Path to the evaluation cache for storing or retrieving data. + + Returns: + List[Stat]: A list containing a single `Stat` object with the correctness + score (1 for correct, 0 for incorrect) under the metric + name "chain_of_thought_correct". + """ + # Assert that completions exist if the result is not None + assert request_state.result is not None and request_state.result.completions, \ + "Request state result must have completions." + + # Set output_text if the assertion passes + output_text = request_state.result.completions[0].text # Extract the answer using the updated logic extracted_answer = extract_answer(output_text) @@ -53,8 +82,10 @@ def evaluate_generation( correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) break - print(request_state.instance.id, correct_answer, extracted_answer) + # Raise an exception if no correct answer is found + if correct_answer is None: + raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}") + # Compare extracted answer with the correct answer and compute the score score = 1 if extracted_answer == correct_answer else 0 - # Return the score in the specified format return [Stat(MetricName("chain_of_thought_correct")).add(score)] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 7fd40dcf91..41ab3e1477 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -417,7 +417,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot metric_specs=get_exact_match_metric_specs() + [ MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), - ], # TODO: update this after cot metric is ready + ], groups=["gpqa"], ) diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 47730d2d68..b00b87e76f 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -88,6 +88,16 @@ metrics: short_display_name: PEM description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing. lower_is_better: false + - name: ifeval_strict_accuracy + display_name: IFEval strict accuracy + short_display_name: IFEval Strict Acc + description: Fraction of instructions in the instance that are correctly followed. + lower_is_better: false + - name: chain_of_thought_correct + display_name: COT correct + short_display_name: COT correct + description: TBD. 
+ lower_is_better: false ############################################################ perturbations: [] @@ -130,7 +140,6 @@ run_groups: subgroups: - mmlu_pro - gpqa - - ifeval - name: mmlu_pro display_name: MMLU-Pro @@ -165,20 +174,3 @@ run_groups: who: "?" when: "?" language: English - - - name: ifeval - display_name: IFEval - description: IFEval - metric_groups: - - accuracy - - efficiency - - general_information - environment: - main_name: ifeval_strict_accuracy - main_split: test - taxonomy: - task: "?" - what: "?" - who: "?" - when: "?" - language: English From 6d5eb55f634bd2a4c8a921d17e74708e29a47641 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Fri, 15 Nov 2024 03:12:13 -0800 Subject: [PATCH 18/18] changes added based on comments --- .../metrics/chain_of_thought_metric.py | 18 ++++++++++-------- src/helm/benchmark/run_specs/lite_run_specs.py | 2 +- src/helm/benchmark/scenarios/mmlu_pro.py | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 67efdbb942..32cfd880f3 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -8,6 +8,7 @@ from helm.benchmark.metrics.metric_service import MetricService from helm.benchmark.metrics.statistic import Stat + def extract_answer(output_text: str) -> Optional[str]: """ Extracts the answer from the output text using two exact regex patterns. @@ -35,8 +36,8 @@ def extract_answer(output_text: str) -> Optional[str]: class ChainOfThoughtMetric(Metric): """ - This metric focuses on structured reasoning and the accuracy of extracted answers. - It compares model outputs against correct answers provided in a multiple-choice + This metric focuses on structured reasoning and the accuracy of extracted answers. + It compares model outputs against correct answers provided in a multiple-choice format and returns a score indicating the correctness of the generated response. """ @@ -50,24 +51,25 @@ def evaluate_generation( """ Evaluate the generated output for chain-of-thought reasoning accuracy. - The method extracts the model's output, determines the correct answer + The method extracts the model's output, determines the correct answer from the provided references, and compares the two to compute a binary score. Args: adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation. - request_state (RequestState): The state of the current request, including + request_state (RequestState): The state of the current request, including the input instance, output results, and references. metric_service (MetricService): A service used to compute metrics if needed. eval_cache_path (str): Path to the evaluation cache for storing or retrieving data. Returns: - List[Stat]: A list containing a single `Stat` object with the correctness - score (1 for correct, 0 for incorrect) under the metric + List[Stat]: A list containing a single `Stat` object with the correctness + score (1 for correct, 0 for incorrect) under the metric name "chain_of_thought_correct". """ # Assert that completions exist if the result is not None - assert request_state.result is not None and request_state.result.completions, \ - "Request state result must have completions." + assert ( + request_state.result is not None and request_state.result.completions + ), "Request state result must have completions." 
# Set output_text if the assertion passes output_text = request_state.result.completions[0].text diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 41ab3e1477..e7c5ea8a83 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -417,7 +417,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot metric_specs=get_exact_match_metric_specs() + [ MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), - ], + ], groups=["gpqa"], ) diff --git a/src/helm/benchmark/scenarios/mmlu_pro.py b/src/helm/benchmark/scenarios/mmlu_pro.py index 4b50f4e9df..5d08d4f9d1 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro.py +++ b/src/helm/benchmark/scenarios/mmlu_pro.py @@ -13,6 +13,7 @@ Output, ) + class MMLUProScenario(Scenario): """ The MMLU-Pro dataset is an advanced version of the Massive Multitask Language Understanding (MMLU) @@ -90,4 +91,3 @@ def get_instances(self, output_path: str) -> List[Instance]: instances.extend(self.process_dataset(data, split)) return instances -
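As a quick sanity check of the extraction logic introduced in this series, the following standalone sketch (not part of the patch) exercises the two patterns from extract_answer as they stand at the end of the series. Note that the escaped brackets in the second pattern match the literal text "[aA]nswer:" rather than an a/A character class, so an output such as "Answer: (D)" falls through to None and is scored as incorrect.

import re
from typing import Optional


def extract_answer(output_text: str) -> Optional[str]:
    """Same two patterns as in chain_of_thought_metric.py."""
    match = re.search(r"answer is \(?([A-J])\)?", output_text)
    if match:
        return match.group(1)
    # \[aA\] is a literal "[aA]", not a character class.
    match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text)
    if match:
        return match.group(1)
    return None


if __name__ == "__main__":
    samples = [
        "Let's think step by step ... The correct answer is (B).",  # first pattern -> "B"
        "Reasoning omitted. [aA]nswer: (D)",                        # second pattern -> "D"
        "Answer: (D)",                                              # no match -> None, scored 0
    ]
    for text in samples:
        print(f"{text!r} -> {extract_answer(text)!r}")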