diff --git a/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf new file mode 100644 index 0000000000..812c23d454 --- /dev/null +++ b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf @@ -0,0 +1,54 @@ +# Alternate version of run_entries_lite_20240424_output_format_instructions.conf that: +# - provides extra instructions for LegalBench +# - provides instructions to omit the output noun for LegalBench +# - omits asking for boolean answers for natural_qa +# Prefer using run_entries_lite_20240424_output_format_instructions.conf instead. + +entries: [ + # NarrativeQA + {description: "narrative_qa:model=text,output_format_instructions=narrative_qa", priority: 1} + + # NaturalQuestions + {description: "natural_qa:model=text,mode=openbook_longans,output_format_instructions=natural_qa_short_answer", priority: 1} + {description: "natural_qa:model=text,mode=closedbook,output_format_instructions=natural_qa_short_answer", priority: 1} + + # OpenbookQA + {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1} + + # MMLU + {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2} + + # MATH + {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2} + {description: 
"math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2} + + # GSM + {description: "gsm:model=text_code,stop=none", priority: 2} + + # LegalBench + {description: "legalbench:model=text_code,subset=abercrombie,output_format_instructions=legalbench_abercrombie_no_prefix,stop=none", priority: 2} + {description: "legalbench:model=text_code,subset=corporate_lobbying,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2} + {description: "legalbench:model=text_code,subset=international_citizenship_questions,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2} + {description: "legalbench:model=text_code,subset=function_of_decision_section,output_format_instructions=legalbench_function_of_decision_section_no_prefix,stop=none", priority: 2} + {description: "legalbench:model=text_code,subset=proa,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2} + + # MedQA + {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2} + + # WMT14 + {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: 
"wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14", priority: 2} +] + diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py index 8f72a83624..b593fd09bc 100644 --- a/src/helm/benchmark/run_expander.py +++ b/src/helm/benchmark/run_expander.py @@ -1426,14 +1426,20 @@ class OutputFormatInstructions(RunExpander): name = "output_format_instructions" _SUFFIX_SUFFIX = "_suffix" + _NO_PREFIX_SUFFIX = "_no_prefix" def __init__(self, scenario: str): + self.suffix = False if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX): - self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)] + scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX) self.suffix = True - else: - self.scenario = scenario - self.suffix = False + + self.no_prefix = False + if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX): + scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX) + self.no_prefix = True + + self.scenario = scenario def expand(self, run_spec: RunSpec) -> List[RunSpec]: if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT: @@ -1454,6 +1460,8 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]: ) elif self.scenario == "natural_qa": instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer." + elif self.scenario == "natural_qa_short_answer": + instructions = "Answer with a short answer." elif self.scenario == "legalbench": if output_noun != "Answer": instructions = f"Answer with the {output_noun.lower()}." @@ -1469,8 +1477,6 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]: instructions = "Answer with the English translation." elif self.scenario == "wmt_14_only_last_sentence": instructions = "Answer with only the English translation for the last sentence." - elif self.scenario == "wmt_14_no_prefix": - instructions = "Answer with the English translation. Do not include 'English:' in your answer." 
elif self.scenario == "math": instructions = "Wrap the final answer with the \\boxed{} command." elif self.scenario == "numeric_nlg": @@ -1487,6 +1493,11 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]: else: raise ValueError(f"Unknown scenario {self.scenario}") + if self.no_prefix: + if instructions: + instructions += " " + instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer." + if self.suffix: return [ replace(