Changes for MMLU PRO with COT #3200

Merged
merged 30 commits on Dec 6, 2024
Changes from 28 commits
Commits (30)
fad62fd
Committing changes for COT metric
siyagoel Nov 11, 2024
89460ec
Changes for COT metric
siyagoel Nov 11, 2024
a366e24
Changes to COT metric
siyagoel Nov 11, 2024
d676183
Changes to COT Metric
siyagoel Nov 14, 2024
de6b9b1
Changes made to file.
siyagoel Nov 14, 2024
6c09cbc
Changes made
siyagoel Nov 14, 2024
2e02fb7
Committing changes
siyagoel Nov 14, 2024
d039a9d
Changes committed
siyagoel Nov 14, 2024
af03185
Correct changes to metric
siyagoel Nov 14, 2024
d675da0
format changes
siyagoel Nov 14, 2024
16afbbe
changes
siyagoel Nov 14, 2024
4a8e167
Merge branch 'main' into siyagoel/cotmetric
siyagoel Nov 14, 2024
d367578
changes to file
siyagoel Nov 14, 2024
23968c2
changed format
siyagoel Nov 14, 2024
90ac194
changes to file by deleting
siyagoel Nov 14, 2024
7cfbb1c
reformat file
siyagoel Nov 14, 2024
c876828
changes in files for schema_lite_z2.yaml
siyagoel Nov 15, 2024
97a9aff
Changes to address comments
siyagoel Nov 15, 2024
6d5eb55
changes added based on comments
siyagoel Nov 15, 2024
1398ab2
MMLU Pro With Metric
siyagoel Nov 15, 2024
243057e
format changes to files
siyagoel Nov 15, 2024
0ce9bc9
Changes to most recent comments
siyagoel Dec 5, 2024
bd7edc1
changes for format
siyagoel Dec 5, 2024
c9b2082
changed the correctness metric
siyagoel Dec 5, 2024
616b30e
Adding new file changes
siyagoel Dec 6, 2024
d71d92d
Adding a new file
siyagoel Dec 6, 2024
33a65de
Changes to formatting
siyagoel Dec 6, 2024
4388c8e
Please enter the commit message for your changes. Lines starting
siyagoel Dec 6, 2024
ed10470
Changed schema file
siyagoel Dec 6, 2024
b09dd91
Changed test scenario to match changes
siyagoel Dec 6, 2024
93 changes: 93 additions & 0 deletions src/helm/benchmark/metrics/chain_of_thought_metric.py
@@ -0,0 +1,93 @@
import re
from typing import List, Optional

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


def extract_answer(output_text: str) -> Optional[str]:
"""
Extracts the answer from the output text using two exact regex patterns.
Returns None if no valid answer is found.

Args:
output_text (str): The text from which to extract the answer.

Returns:
Optional[str]: The extracted answer (A-J) if found, otherwise None.
"""
# First regex: Matches "answer is (A-J)" with optional parentheses
match = re.search(r"answer is \(?([A-J])\)?", output_text)
if match:
return match.group(1)

# Second regex: Matches "Answer: (A-J)" or "answer: (A-J)", with optional leading characters like "."
match = re.search(r"\.*[aA]nswer:\s*\(?([A-J])\)?", output_text)
if match:
return match.group(1)

# If neither regex matches, return None
return None


class ChainOfThoughtMetric(Metric):
"""
This metric focuses on structured reasoning and the accuracy of extracted answers.
It compares model outputs against correct answers provided in a multiple-choice
format and returns a score indicating the correctness of the generated response.
"""

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""
Evaluate the generated output for chain-of-thought reasoning accuracy.

The method extracts the model's output, determines the correct answer
from the provided references, and compares the two to compute a binary score.

Args:
adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation.
request_state (RequestState): The state of the current request, including
the input instance, output results, and references.
metric_service (MetricService): A service used to compute metrics if needed.
eval_cache_path (str): Path to the evaluation cache for storing or retrieving data.

Returns:
List[Stat]: A list containing a single `Stat` object with the correctness
score (1 for correct, 0 for incorrect) under the metric
name "chain_of_thought_correct".
"""
# Assert that completions exist if the result is not None
assert (
request_state.result is not None and request_state.result.completions
), "Request state result must have completions."

# Set output_text if the assertion passes
output_text = request_state.result.completions[0].text

# Extract the answer using the updated logic
extracted_answer = extract_answer(output_text)

# Find the correct answer from references by translating index to letter
correct_answer = None
for index, option in enumerate(request_state.instance.references):
if option.is_correct:
correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.)
break

# Raise an exception if no correct answer is found
if correct_answer is None:
raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}")

# Compare extracted answer with the correct answer and compute the score
score = 1 if extracted_answer == correct_answer else 0
return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
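For context, a minimal usage sketch of the new metric follows (not part of the diff; the completion strings and the correct_answer value are invented for illustration). It shows which outputs the two extraction patterns accept and how the binary score is derived.

# Illustrative sketch only; the sample completions are made up, not taken from the PR.
from helm.benchmark.metrics.chain_of_thought_metric import extract_answer

completion = "Let's think step by step. ... The correct answer is (B)."
assert extract_answer(completion) == "B"  # caught by the "answer is (A-J)" pattern
assert extract_answer("Answer: C") == "C"  # caught by the fallback "answer:" pattern
assert extract_answer("I cannot decide.") is None  # neither pattern matches

# The metric then compares the extracted letter against the correct reference letter:
correct_answer = "B"  # hypothetical reference letter for this example
score = 1 if extract_answer(completion) == correct_answer else 0
assert score == 1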
188 changes: 100 additions & 88 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -21,11 +21,11 @@
get_generative_harms_metric_specs,
get_generic_metric_specs,
get_open_ended_generation_metric_specs,
MetricSpec,
)
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.runner import get_benchmark_output_path
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
from helm.benchmark.metrics.metric import MetricSpec


@run_spec_function("narrative_qa")
@@ -137,25 +137,59 @@ def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:


@run_spec_function("mmlu_pro")
def get_mmlu_pro_spec(subject: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.mmlu_pro.MMLUProScenario", args={"subject": subject}
)
def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
# Convert to bools and remove the str versions
use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
use_few_shot_bool: bool = use_few_shot == "True"
del use_chain_of_thought
del use_few_shot

adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.",
input_noun="Question",
output_noun="Answer",
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject}
)
max_train_instance_num = 5 if use_few_shot_bool else 0

return RunSpec(
name=f"mmlu_pro:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["mmlu_pro"],
)
if use_chain_of_thought_bool:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=(
"Let’s think step by step. Based on your reasoning, what is the single, "
"most likely answer choice? Format your response as follows: "
'"The correct answer is (insert answer here)".'
),
)
return RunSpec(
name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs()
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["mmlu_pro"],
)
else:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
)
return RunSpec(
name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["mmlu_pro"],
)


@run_spec_function("gsm")
@@ -344,79 +378,57 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
)
max_train_instance_num = 5 if use_few_shot_bool else 0

if use_few_shot_bool:
if use_chain_of_thought_bool:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_tokens=1000, # following original repo
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
global_suffix=(
"Give step by step reasoning before you answer, and when you’re ready to answer, "
'please use the format "The correct answer is (insert answer here)":'
),
)
else:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)
if use_chain_of_thought_bool:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_tokens=1000, # following original repo
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
)
return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs()
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["gpqa"],
)
else:
if use_chain_of_thought_bool:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
reference_prefix="(A) ",
global_suffix=(
"Let’s think step by step. Based on your reasoning, what is the single, "
"most likely answer choice? Format your response as follows: "
'"The correct answer is (insert answer here)".'
),
)
else:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
max_tokens=1000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
reference_prefix="(A) ",
global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
)
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)

return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready
groups=["gpqa"],
)
return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["gpqa"],
)


@run_spec_function("ifeval")
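As a usage sketch for the updated mmlu_pro run-spec entry point (not part of the diff; the subject value and the printed expectations are illustrative assumptions), note that both new flags arrive as strings and only the exact value "True" enables them:

# Minimal sketch; "physics" is just an example MMLU-Pro category.
from helm.benchmark.run_specs.lite_run_specs import get_mmlu_pro_spec

run_spec = get_mmlu_pro_spec(subject="physics", use_chain_of_thought="True", use_few_shot="False")

print(run_spec.name)  # expected: mmlu_pro:subset=physics,use_chain_of_thought=True
print(run_spec.adapter_spec.max_train_instances)  # 0, since few-shot is disabled
print([m.class_name for m in run_spec.metric_specs])  # should include ...chain_of_thought_metric.ChainOfThoughtMetric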
src/helm/benchmark/scenarios/mmlu_pro_scenario.py
@@ -1,8 +1,17 @@
from typing import Dict, List
from datasets import load_dataset
from datasets import Dataset, load_dataset

from helm.common.hierarchical_logger import hlog
from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TRAIN_SPLIT,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)


class MMLUProScenario(Scenario):
@@ -33,7 +42,14 @@ def __init__(self, subject: str):
super().__init__()
self.subject: str = subject

def process_csv(self, data, split: str) -> List[Instance]:
def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
"""
Process the dataset to create instances.

:param data: Hugging Face `Dataset` containing the data for a specific split.
:param split: The data split (e.g., "train", "test").
:return: A list of processed `Instance` objects.
"""
instances: List[Instance] = []
hlog(f"Processing data for {split} split")
for row in data:
@@ -55,8 +71,14 @@ def answer_to_reference(answer: str) -> Reference:
return instances

def get_instances(self, output_path: str) -> List[Instance]:
"""
Load and process the MMLU-Pro dataset to create instances.

:param output_path: Path to save or output the processed instances.
:return: A list of all processed `Instance` objects.
"""
# Load the MMLU-Pro dataset from Hugging Face
dataset = load_dataset("TIGER-Lab/MMLU-Pro")
dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b")

# Process all the instances
instances: List[Instance] = []
@@ -66,6 +88,6 @@ def get_instances(self, output_path: str) -> List[Instance]:
}
for hf_split, split in splits.items():
data = dataset[hf_split].filter(lambda x: x["category"] == self.subject)
instances.extend(self.process_csv(data, split))
instances.extend(self.process_dataset(data, split))

return instances
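To exercise the updated scenario directly, here is a minimal sketch (not part of the diff; the category and output path are arbitrary examples):

# Minimal sketch; "engineering" is just one MMLU-Pro category, and the output path is arbitrary.
from helm.benchmark.scenarios.mmlu_pro_scenario import MMLUProScenario
from helm.benchmark.scenarios.scenario import TEST_SPLIT

scenario = MMLUProScenario(subject="engineering")
instances = scenario.get_instances(output_path="/tmp/mmlu_pro")  # pulls TIGER-Lab/MMLU-Pro at the pinned revision

test_instances = [inst for inst in instances if inst.split == TEST_SPLIT]
print(len(test_instances))
print(test_instances[0].input.text)  # the question text
print([ref.output.text for ref in test_instances[0].references if ref.is_correct])  # the gold option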