Changes to formatting

stanford-crfm · Dec 6, 2024 · commit 33a65de
1 parent: d71d92d
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 51 deletions.
diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py
@@ -179,7 +179,8 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat
         derived_stats: List[Stat] = []
         derived_stats.extend(compute_calibration_metrics(per_instance_stats))
         return derived_stats
-
+
+
 class BasicReferenceMetric(ReferenceMetric):
     """
     Defines basic metrics for Scenarios that use one Request per Reference instead of

diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -169,7 +169,7 @@ def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few
             adapter_spec=adapter_spec,
             metric_specs=get_exact_match_metric_specs()
             + [
-            MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
+                MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
             ],
             groups=["mmlu_pro"],
         )
@@ -380,56 +380,56 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot
     max_train_instance_num = 5 if use_few_shot_bool else 0
 
     if use_chain_of_thought_bool:
-            adapter_spec = get_multiple_choice_adapter_spec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
-                max_tokens=1000,  # following original repo
-                max_train_instances=max_train_instance_num,
-                instructions=(
-                    "Here are some example questions from experts. "
-                    "An explanation is given before the final answer. "
-                    "Answer the final question yourself, giving your reasoning beforehand."
-                ),
-                input_noun="Question",
-                input_suffix="\nChoices: \n",
-                reference_prefix="(A) ",
-                chain_of_thought_prefix="Let's think step by step: ",
-                chain_of_thought_suffix="The correct answer is ",
-                output_noun="",  # will be overwritten with output_prefix
-                output_prefix="",
-            )
-            return RunSpec(
-                name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
-                scenario_spec=scenario_spec,
-                adapter_spec=adapter_spec,
-                metric_specs=get_exact_match_metric_specs()
-                + [
-                    MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
-                ],
-                groups=["gpqa"],
-            )
+        adapter_spec = get_multiple_choice_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+            max_tokens=1000,  # following original repo
+            max_train_instances=max_train_instance_num,
+            instructions=(
+                "Here are some example questions from experts. "
+                "An explanation is given before the final answer. "
+                "Answer the final question yourself, giving your reasoning beforehand."
+            ),
+            input_noun="Question",
+            input_suffix="\nChoices: \n",
+            reference_prefix="(A) ",
+            chain_of_thought_prefix="Let's think step by step: ",
+            chain_of_thought_suffix="The correct answer is ",
+            output_noun="",  # will be overwritten with output_prefix
+            output_prefix="",
+        )
+        return RunSpec(
+            name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
+            scenario_spec=scenario_spec,
+            adapter_spec=adapter_spec,
+            metric_specs=get_exact_match_metric_specs()
+            + [
+                MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
+            ],
+            groups=["gpqa"],
+        )
     else:
-            adapter_spec = get_multiple_choice_adapter_spec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT,
-                max_train_instances=max_train_instance_num,
-                instructions=(
-                    "Here are some example questions from experts. "
-                    "An explanation is given before the final answer. "
-                    "Answer the final question yourself, giving your reasoning beforehand."
-                ),
-                input_noun="Question",
-                input_suffix="\nChoices: \n",
-                reference_prefix="(A) ",
-                output_noun="",  # will be overwritten with output_prefix
-                output_prefix="The correct answer is ",
-            )
-
-            return RunSpec(
-                name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
-                scenario_spec=scenario_spec,
-                adapter_spec=adapter_spec,
-                metric_specs=get_exact_match_metric_specs(),
-                groups=["gpqa"],
-            )
+        adapter_spec = get_multiple_choice_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT,
+            max_train_instances=max_train_instance_num,
+            instructions=(
+                "Here are some example questions from experts. "
+                "An explanation is given before the final answer. "
+                "Answer the final question yourself, giving your reasoning beforehand."
+            ),
+            input_noun="Question",
+            input_suffix="\nChoices: \n",
+            reference_prefix="(A) ",
+            output_noun="",  # will be overwritten with output_prefix
+            output_prefix="The correct answer is ",
+        )
+
+        return RunSpec(
+            name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
+            scenario_spec=scenario_spec,
+            adapter_spec=adapter_spec,
+            metric_specs=get_exact_match_metric_specs(),
+            groups=["gpqa"],
+        )
 
 
 @run_spec_function("ifeval")