stanford-crfm · yifanmai · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
@@ -0,0 +1,54 @@
+# Alternate version of run_entries_lite_20240424_output_format_instructions.conf that:
+# - provides extra instructions for LegalBench
+# - instructs the model not to repeat the output prefix in its LegalBench answers (the "_no_prefix" variants)
+# - does not ask for boolean (yes/no) answers for natural_qa; only a short answer is requested
+# Prefer using run_entries_lite_20240424_output_format_instructions.conf instead.
+
+entries: [
+  # NarrativeQA
+  {description: "narrative_qa:model=text,output_format_instructions=narrative_qa", priority: 1}
+
+  # NaturalQuestions
+  {description: "natural_qa:model=text,mode=openbook_longans,output_format_instructions=natural_qa_short_answer", priority: 1}
+  {description: "natural_qa:model=text,mode=closedbook,output_format_instructions=natural_qa_short_answer", priority: 1}
+
+  # OpenbookQA
+  {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1}
+
+  # MMLU
+  {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2}
+
+  # MATH
+  {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}
+
+  # GSM
+  {description: "gsm:model=text_code,stop=none", priority: 2}
+
+  # LegalBench
+  {description: "legalbench:model=text_code,subset=abercrombie,output_format_instructions=legalbench_abercrombie_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=corporate_lobbying,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=international_citizenship_questions,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=function_of_decision_section,output_format_instructions=legalbench_function_of_decision_section_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=proa,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
+
+  # MedQA
+  {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2}
+
+  # WMT14
+  {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14", priority: 2}
+]
+
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
@@ -1426,14 +1426,20 @@ class OutputFormatInstructions(RunExpander):
     name = "output_format_instructions"
 
     _SUFFIX_SUFFIX = "_suffix"
+    _NO_PREFIX_SUFFIX = "_no_prefix"
 
     def __init__(self, scenario: str):
+        self.suffix = False  # set to True below when the name ends with "_suffix"
         if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
-            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
             self.suffix = True
-        else:
-            self.scenario = scenario
-            self.suffix = False
+
+        self.no_prefix = False  # checked after "_suffix" is stripped: "<name>_no_prefix_suffix" sets both flags
+        if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
+            scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
+            self.no_prefix = True
+
+        self.scenario = scenario  # name left after the recognized trailing markers are removed
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
@@ -1454,6 +1460,8 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
                 )
             elif self.scenario == "natural_qa":
                 instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "natural_qa_short_answer":
+                instructions = "Answer with a short answer."
             elif self.scenario == "legalbench":
                 if output_noun != "Answer":
                     instructions = f"Answer with the {output_noun.lower()}."
@@ -1469,8 +1477,6 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
                 instructions = "Answer with the English translation."
             elif self.scenario == "wmt_14_only_last_sentence":
                 instructions = "Answer with only the English translation for the last sentence."
-            elif self.scenario == "wmt_14_no_prefix":
-                instructions = "Answer with the English translation. Do not include 'English:' in your answer."
             elif self.scenario == "math":
                 instructions = "Wrap the final answer with the \\boxed{} command."
             elif self.scenario == "numeric_nlg":
@@ -1487,6 +1493,11 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
 
+        if self.no_prefix:
+            if instructions:
+                instructions += " "
+            instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+
         if self.suffix:
             return [
                 replace(