Please enter the commit message for your changes. Lines starting

stanford-crfm · Dec 6, 2024 · 4388c8e
1 parent 33a65de
commit 4388c8e
Show file tree

Hide file tree

Showing 5 changed files with 14 additions and 280 deletions.
diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric_correctness.py b/src/helm/benchmark/metrics/chain_of_thought_metric_correctness.py
diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -181,7 +181,6 @@ def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few
             input_prefix="What is the correct answer to this question: ",
             input_suffix="\nChoices:\n",
             output_prefix="",
-            reference_prefix="(A) ",
             global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
         )
         return RunSpec(

diff --git a/src/helm/benchmark/scenarios/mmlu_pro.py b/src/helm/benchmark/scenarios/mmlu_pro.py
diff --git a/src/helm/benchmark/scenarios/mmlu_scenario_pro.py b/src/helm/benchmark/scenarios/mmlu_scenario_pro.py
diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml
@@ -159,6 +159,20 @@ run_groups:
       when: "?"
       language: English
 
+  - name: ifeval
+    display_name: IFEval
+    description: IFEval
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: ifeval_strict_accuracy
+      main_name: chain_of_thought_correct  # non-CoT
+      main_split: test
+    taxonomy:
+      task: "?"
+
   - name: gpqa
     display_name: GPQA
     description: GPQA