From 07f38b2441d0fc9f65c37877f4b887e23d87751a Mon Sep 17 00:00:00 2001
From: Yifan Mai
Date: Thu, 19 Dec 2024 20:28:18 -0800
Subject: [PATCH 1/5] Remove boolean answer from instructions for natural_qa

---
 src/helm/benchmark/run_expander.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index 8f72a83624..e4826a9982 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -1454,6 +1454,8 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
             )
         elif self.scenario == "natural_qa":
             instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+        elif self.scenario == "natural_qa_short_answer":
+            instructions = "Answer with a short answer."
         elif self.scenario == "legalbench":
             if output_noun != "Answer":
                 instructions = f"Answer with the {output_noun.lower()}."

From 26be45277c1dc7307c9dce53846e8db2216bcbbd Mon Sep 17 00:00:00 2001
From: Yifan Mai
Date: Thu, 19 Dec 2024 20:40:10 -0800
Subject: [PATCH 2/5] Add conf file

---
 .../run_entries_lite_20240424_nova.conf | 54 +++++++++++++++++++
 src/helm/benchmark/run_expander.py      | 19 +++++--
 2 files changed, 69 insertions(+), 4 deletions(-)
 create mode 100644 src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf

diff --git a/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
new file mode 100644
index 0000000000..d7f07ae92f
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
@@ -0,0 +1,54 @@
+# Alternate version of run_entries_lite_20240424_output_format_instructions.conf that:
+# - provides extra instructions for LegalBench
+# - provides instructions to omit the output noun for LegalBench
+# - omits asking for boolean answers for natural_qa
+# Prefer using run_entries_lite_20240424_output_format_instructions.conf instead.
+
+entries: [
+  # NarrativeQA
+  {description: "narrative_qa:model=text,output_format_instructions=narrative_qa", priority: 1}
+
+  # NaturalQuestions
+  {description: "natural_qa:model=text,mode=openbook_longans,output_format_instructions=natural_qa_short_answer", priority: 1}
+  {description: "natural_qa:model=text,mode=closedbook,output_format_instructions=natural_qa_short_answer", priority: 1}
+
+  # OpenbookQA
+  {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1}
+
+  # MMLU
+  {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2}
+
+  # MATH
+  {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}
+
+  # GSM
+  {description: "gsm:model=text_code,stop=none", priority: 2}
+
+  # LegalBench
+  {description: "legalbench:model=text_code,subset=abercrombie,output_format_instructions=legalbench_abercrombie_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=corporate_lobbying,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=international_citizenship_questions,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=function_of_decision_section,output_format_instructions=legalbench_function_of_decision_section_no_prefix,stop=none", priority: 2}
+  {description: "legalbench:model=text_code,subset=proa,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
+
+  # MedQA
+  {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2}
+
+  # WMT14
+  {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14", priority: 2}
+]
+
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index e4826a9982..751c265d15 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -1426,14 +1426,20 @@ class OutputFormatInstructions(RunExpander):
     name = "output_format_instructions"
 
     _SUFFIX_SUFFIX = "_suffix"
+    _NO_PREFIX_SUFFIX = "_no_prefix"
 
     def __init__(self, scenario: str):
+        self.suffix = False
         if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
-            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
             self.suffix = True
-        else:
-            self.scenario = scenario
-            self.suffix = False
+        
+        self.no_prefix = False
+        if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
+            scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
+            self.no_prefix = True
+
+        self.scenario = scenario
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
@@ -1488,6 +1494,11 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
             )
         else:
             raise ValueError(f"Unknown scenario {self.scenario}")
+        
+        if self.no_prefix:
+            if instructions:
+                instructions += " "
+            instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
 
         if self.suffix:
             return [

From 4625afc3eb1466869f14b39102c2160e3a0c2f1d Mon Sep 17 00:00:00 2001
From: Yifan Mai
Date: Thu, 19 Dec 2024 21:00:11 -0800
Subject: [PATCH 3/5] Add run entries

---
 .../run_entries_lite_20240424_nova.conf | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
index d7f07ae92f..6cecdb033e 100644
--- a/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
+++ b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
@@ -12,27 +12,27 @@ entries: [
   {description: "natural_qa:model=text,mode=openbook_longans,output_format_instructions=natural_qa_short_answer", priority: 1}
   {description: "natural_qa:model=text,mode=closedbook,output_format_instructions=natural_qa_short_answer", priority: 1}
 
-  # OpenbookQA
-  {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1}
-
-  # MMLU
-  {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2}
-  {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2}
-  {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2}
-  {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2}
-  {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2}
-
-  # MATH
-  {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
-  {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
-  {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
-  {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
-  {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
-  {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
-  {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}
-
-  # GSM
-  {description: "gsm:model=text_code,stop=none", priority: 2}
+# # OpenbookQA
+# {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1}
+
+# # MMLU
+# {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2}
+# {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2}
+# {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2}
+# {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2}
+# {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2}
+
+# # MATH
+# {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
+# {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
+# {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
+# {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
+# {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
+# {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
+# {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}
+
+# # GSM
+# {description: "gsm:model=text_code,stop=none", priority: 2}
 
   # LegalBench
   {description: "legalbench:model=text_code,subset=abercrombie,output_format_instructions=legalbench_abercrombie_no_prefix,stop=none", priority: 2}
@@ -41,14 +41,14 @@ entries: [
   {description: "legalbench:model=text_code,subset=function_of_decision_section,output_format_instructions=legalbench_function_of_decision_section_no_prefix,stop=none", priority: 2}
   {description: "legalbench:model=text_code,subset=proa,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
 
-  # MedQA
-  {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2}
+# # MedQA
+# {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2}
 
-  # WMT14
-  {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14", priority: 2}
-  {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14", priority: 2}
-  {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14", priority: 2}
-  {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14", priority: 2}
-  {description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14", priority: 2}
+# # WMT14
+  {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
+  {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
+  {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
+  {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
+  {description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
 ]
 

From 085778de2342ed178971d545f02dd439cc252138 Mon Sep 17 00:00:00 2001
From: Yifan Mai
Date: Thu, 19 Dec 2024 21:01:13 -0800
Subject: [PATCH 4/5] Lint

---
 src/helm/benchmark/run_expander.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index 751c265d15..0416f8c60c 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -1433,7 +1433,7 @@ def __init__(self, scenario: str):
         if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
             scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
             self.suffix = True
-        
+
         self.no_prefix = False
         if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
             scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
@@ -1494,7 +1494,7 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
             )
         else:
             raise ValueError(f"Unknown scenario {self.scenario}")
-        
+
         if self.no_prefix:
             if instructions:
                 instructions += " "

From 762a741fa65b8ecebd2f568142c2196cb1498893 Mon Sep 17 00:00:00 2001
From: Yifan Mai
Date: Thu, 19 Dec 2024 21:07:49 -0800
Subject: [PATCH 5/5] More fixes

---
 .../run_entries_lite_20240424_nova.conf | 56 +++++++++----------
 src/helm/benchmark/run_expander.py      |  2 -
 2 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
index 6cecdb033e..812c23d454 100644
--- a/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
+++ b/src/helm/benchmark/presentation/run_entries_lite_20240424_nova.conf
@@ -12,27 +12,27 @@ entries: [
   {description: "natural_qa:model=text,mode=openbook_longans,output_format_instructions=natural_qa_short_answer", priority: 1}
   {description: "natural_qa:model=text,mode=closedbook,output_format_instructions=natural_qa_short_answer", priority: 1}
 
-# # OpenbookQA
-# {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1}
-
-# # MMLU
-# {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2}
-# {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2}
-# {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2}
-# {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2}
-# {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2}
-
-# # MATH
-# {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
-# {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
-# {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
-# {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
-# {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
-# {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
-# {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}
-
-# # GSM
-# {description: "gsm:model=text_code,stop=none", priority: 2}
+  # OpenbookQA
+  {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1}
+
+  # MMLU
+  {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2}
+  {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2}
+
+  # MATH
+  {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}
+
+  # GSM
+  {description: "gsm:model=text_code,stop=none", priority: 2}
 
   # LegalBench
   {description: "legalbench:model=text_code,subset=abercrombie,output_format_instructions=legalbench_abercrombie_no_prefix,stop=none", priority: 2}
@@ -41,14 +41,14 @@ entries: [
   {description: "legalbench:model=text_code,subset=function_of_decision_section,output_format_instructions=legalbench_function_of_decision_section_no_prefix,stop=none", priority: 2}
   {description: "legalbench:model=text_code,subset=proa,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
 
-# # MedQA
-# {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2}
+  # MedQA
+  {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2}
 
 # # WMT14
-  {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
-  {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
-  {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
-  {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
-  {description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14_no_prefix", priority: 2}
+  {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14", priority: 2}
+  {description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14", priority: 2}
 ]
 
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index 0416f8c60c..b593fd09bc 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -1477,8 +1477,6 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
             instructions = "Answer with the English translation."
         elif self.scenario == "wmt_14_only_last_sentence":
             instructions = "Answer with only the English translation for the last sentence."
-        elif self.scenario == "wmt_14_no_prefix":
-            instructions = "Answer with the English translation. Do not include 'English:' in your answer."
         elif self.scenario == "math":
             instructions = "Wrap the final answer with the \\boxed{} command."
         elif self.scenario == "numeric_nlg":
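
Taken together, the patches above change OutputFormatInstructions so that a scenario name passed via output_format_instructions= can carry optional "_suffix" and/or "_no_prefix" modifiers: "_suffix" is checked and stripped first, then "_no_prefix", and the no_prefix flag appends a sentence built from the adapter's output_prefix, which is what lets PATCH 5/5 delete the hard-coded wmt_14_no_prefix branch. The standalone Python sketch below restates that parsing and sentence-building logic outside HELM so it can be sanity-checked in isolation; the helper names, the __main__ examples, and the "English: " prefix value are illustrative assumptions, not HELM APIs.

# Illustrative sketch only -- not part of the patch series. It mirrors the logic
# that PATCH 2/5 adds to OutputFormatInstructions; names here are hypothetical.
from typing import Tuple

SUFFIX_SUFFIX = "_suffix"
NO_PREFIX_SUFFIX = "_no_prefix"


def parse_scenario(scenario: str) -> Tuple[str, bool, bool]:
    """Strip the "_suffix" and "_no_prefix" modifiers, in that order."""
    suffix = False
    if scenario.endswith(SUFFIX_SUFFIX):
        scenario = scenario.removesuffix(SUFFIX_SUFFIX)
        suffix = True

    no_prefix = False
    if scenario.endswith(NO_PREFIX_SUFFIX):
        scenario = scenario.removesuffix(NO_PREFIX_SUFFIX)
        no_prefix = True

    return scenario, suffix, no_prefix


def apply_no_prefix(instructions: str, output_prefix: str) -> str:
    """Append the generic prefix-suppression sentence used when no_prefix is set."""
    if instructions:
        instructions += " "
    return instructions + f"Do not include '{output_prefix.strip()}' in your answer."


if __name__ == "__main__":
    # Scenario strings taken from the conf file above.
    print(parse_scenario("legalbench_yes_or_no_no_prefix"))  # ('legalbench_yes_or_no', False, True)
    print(parse_scenario("natural_qa_short_answer"))         # ('natural_qa_short_answer', False, False)

    # With an assumed output prefix of "English: " (the value implied by the
    # hard-coded branch deleted in PATCH 5/5), the generated sentence matches
    # the old wmt_14_no_prefix wording.
    print(apply_no_prefix("Answer with the English translation.", "English: "))
    # -> Answer with the English translation. Do not include 'English:' in your answer.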