Skip to content

Commit

Permalink
Changes to formatting
Browse files — browse the repository at this point in the history
  • Loading branch information
siyagoel committed Dec 6, 2024
1 parent d71d92d commit 33a65de
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 51 deletions.
3 changes: 2 additions & 1 deletion src/helm/benchmark/metrics/basic_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat
derived_stats: List[Stat] = []
derived_stats.extend(compute_calibration_metrics(per_instance_stats))
return derived_stats



class BasicReferenceMetric(ReferenceMetric):
"""
Defines basic metrics for Scenarios that use one Request per Reference instead of
Expand Down
100 changes: 50 additions & 50 deletions src/helm/benchmark/run_specs/lite_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs()
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["mmlu_pro"],
)
Expand Down Expand Up @@ -380,56 +380,56 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot
max_train_instance_num = 5 if use_few_shot_bool else 0

if use_chain_of_thought_bool:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_tokens=1000, # following original repo
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
)
return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs()
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["gpqa"],
)
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_tokens=1000, # following original repo
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
)
return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs()
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["gpqa"],
)
else:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)

return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["gpqa"],
)
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)

return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["gpqa"],
)


@run_spec_function("ifeval")
Expand Down

0 comments on commit 33a65de

Please sign in to comment.