Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more instructions customization to output_format_instructions run expander #3233

Merged
merged 5 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Alternate version of run_entries_lite_20240424_output_format_instructions.conf that:
# - provides extra instructions for LegalBench
# - provides instructions to omit the output noun for LegalBench
# - does not ask for boolean answers for natural_qa
# Prefer using run_entries_lite_20240424_output_format_instructions.conf instead.

entries: [
# NarrativeQA
{description: "narrative_qa:model=text,output_format_instructions=narrative_qa", priority: 1}

# NaturalQuestions
{description: "natural_qa:model=text,mode=openbook_longans,output_format_instructions=natural_qa_short_answer", priority: 1}
{description: "natural_qa:model=text,mode=closedbook,output_format_instructions=natural_qa_short_answer", priority: 1}

# OpenbookQA
{description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1}

# MMLU
{description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2}
{description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2}
{description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2}
{description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2}
{description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2}

# MATH
{description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
{description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
{description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
{description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
{description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
{description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
{description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}

# GSM
{description: "gsm:model=text_code,stop=none", priority: 2}

# LegalBench
{description: "legalbench:model=text_code,subset=abercrombie,output_format_instructions=legalbench_abercrombie_no_prefix,stop=none", priority: 2}
{description: "legalbench:model=text_code,subset=corporate_lobbying,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
{description: "legalbench:model=text_code,subset=international_citizenship_questions,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}
{description: "legalbench:model=text_code,subset=function_of_decision_section,output_format_instructions=legalbench_function_of_decision_section_no_prefix,stop=none", priority: 2}
{description: "legalbench:model=text_code,subset=proa,output_format_instructions=legalbench_yes_or_no_no_prefix,stop=none", priority: 2}

# MedQA
{description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2}

# WMT14
{description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14", priority: 2}
{description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14", priority: 2}
{description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14", priority: 2}
{description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14", priority: 2}
{description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14", priority: 2}
]

23 changes: 17 additions & 6 deletions src/helm/benchmark/run_expander.py
Original file line number Diff line number Diff line change
Expand Up @@ -1426,14 +1426,20 @@ class OutputFormatInstructions(RunExpander):
name = "output_format_instructions"

_SUFFIX_SUFFIX = "_suffix"
_NO_PREFIX_SUFFIX = "_no_prefix"

def __init__(self, scenario: str):
self.suffix = False
if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
self.suffix = True
else:
self.scenario = scenario
self.suffix = False

self.no_prefix = False
if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
self.no_prefix = True

self.scenario = scenario

def expand(self, run_spec: RunSpec) -> List[RunSpec]:
if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
Expand All @@ -1454,6 +1460,8 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
)
elif self.scenario == "natural_qa":
instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
elif self.scenario == "natural_qa_short_answer":
instructions = "Answer with a short answer."
elif self.scenario == "legalbench":
if output_noun != "Answer":
instructions = f"Answer with the {output_noun.lower()}."
Expand All @@ -1469,8 +1477,6 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
instructions = "Answer with the English translation."
elif self.scenario == "wmt_14_only_last_sentence":
instructions = "Answer with only the English translation for the last sentence."
elif self.scenario == "wmt_14_no_prefix":
instructions = "Answer with the English translation. Do not include 'English:' in your answer."
elif self.scenario == "math":
instructions = "Wrap the final answer with the \\boxed{} command."
elif self.scenario == "numeric_nlg":
Expand All @@ -1487,6 +1493,11 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
else:
raise ValueError(f"Unknown scenario {self.scenario}")

if self.no_prefix:
if instructions:
instructions += " "
instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."

if self.suffix:
return [
replace(
Expand Down
Loading