From fad62fd07facb785e0300cea841ca332e6b6ed8e Mon Sep 17 00:00:00 2001 From: siyagoel Date: Mon, 11 Nov 2024 15:17:00 -0800 Subject: [PATCH 01/18] Committing changes for COT metric --- .../metrics/air_bench_metrics copy.py | 56 ++++++++++++ src/helm/benchmark/metrics/basic_metrics.py | 2 +- .../metrics/chain_of_thought_metric.py | 61 +++++++++++++ .../benchmark/run_specs/lite_run_specs.py | 90 ++++++++++++++++++- 4 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 src/helm/benchmark/metrics/air_bench_metrics copy.py create mode 100644 src/helm/benchmark/metrics/chain_of_thought_metric.py diff --git a/src/helm/benchmark/metrics/air_bench_metrics copy.py b/src/helm/benchmark/metrics/air_bench_metrics copy.py new file mode 100644 index 0000000000..97401b9978 --- /dev/null +++ b/src/helm/benchmark/metrics/air_bench_metrics copy.py @@ -0,0 +1,56 @@ +from typing import List + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics +from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + + +class AIRBench2024BasicGenerationMetric(Metric): + """Replacement for BasicGenerationMetric for AIRBench 2024. + + We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` + because we abuse "references" to store metadata rather than true metadata.""" + + def __init__(self): + super().__init__() + self.efficiency_metric = EfficiencyMetric() + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service) + + +class AIRBench2024ScoreMetric(Metric): + """Score metrics for AIRBench 2024.""" + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + assert len(request_state.instance.references) > 1 + category_text = request_state.instance.references[0].output.text + category_parts = category_text.split(".") + assert len(category_parts) == 3 + assert request_state.annotations + score = request_state.annotations["air_bench_2024"]["score"] + return [ + Stat(MetricName("air_score")).add(score), + Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score), + Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score), + Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add( + score + ), + ] diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 48c933f076..0e11ac24dd 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -181,7 +181,7 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat return derived_stats -class BasicReferenceMetric(ReferenceMetric): +class BasicGenerationMetric(ReferenceMetric): """ Defines basic metrics for Scenarios that use one Request per Reference instead of one per Instance. 
diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py new file mode 100644 index 0000000000..6e6830d9c7 --- /dev/null +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -0,0 +1,61 @@ +from typing import List + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics +from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + +import re +import random +from typing import List + + + +class ChainOfThoughtMetric(Metric): + """Replacement for BasicGenerationMetric for AIRBench 2024. + + We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` + because we abuse "references" to store metadata rather than true metadata.""" + + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, +) -> List[Stat]: + # Output from the model + output_text = request_state.result.completions[0].text + + # Initial regex pattern to match answer + match = re.search(r'answer is \(?([A-J])\)?', output_text) + + # Secondary regex pattern if the initial one fails + if not match: + match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) + + # Fallback mechanism + if match: + extracted_answer = match.group(1) + else: + extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) + + # Find the correct answer from references + correct_answer = None + for option in request_state.instance.references: + if option.get("is_correct"): + correct_answer = option.get("label") # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. 
+ break + + # Return the score in the specified format + score = 1 if extracted_answer == correct_answer else 0 + return [ + Stat(MetricName("chain_of_thought_correct")).add(score) + ] + + diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 99b111c804..49b905e9f5 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -25,6 +25,7 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path +from helm.benchmark.metrics.metric import MetricSpec; @run_spec_function("narrative_qa") @@ -413,6 +414,93 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready + metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready + groups=["gpqa"], + ) + +@run_spec_function("gpqa") +def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: + # Convert to bools and remove the str versions + use_chain_of_thought_bool: bool = use_chain_of_thought == "True" + use_few_shot_bool: bool = use_few_shot == "True" + del use_chain_of_thought + del use_few_shot + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset} + ) + max_train_instance_num = 5 if use_few_shot_bool else 0 + + if use_few_shot_bool: + if use_chain_of_thought_bool: + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_tokens=1000, # following original repo + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." + ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + chain_of_thought_prefix="Let's think step by step: ", + chain_of_thought_suffix="The correct answer is ", + output_noun="", # will be overwritten with output_prefix + output_prefix="", + global_suffix=( + "Give step by step reasoning before you answer, and when you’re ready to answer, " + 'please use the format "The correct answer is (insert answer here)":' + ), + ) + else: + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." 
+ ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + output_noun="", # will be overwritten with output_prefix + output_prefix="The correct answer is ", + ) + else: + if use_chain_of_thought_bool: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + reference_prefix="(A) ", + global_suffix=( + "Let’s think step by step. Based on your reasoning, what is the single, " + "most likely answer choice? Format your response as follows: " + '"The correct answer is (insert answer here)".' + ), + ) + else: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + reference_prefix="(A) ", + global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), + ) + + return RunSpec( + name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready groups=["gpqa"], ) From 89460ec06d0f847cb731a045b7d3eecd6c1671a1 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Mon, 11 Nov 2024 15:53:39 -0800 Subject: [PATCH 02/18] Changes for COT metrix --- src/helm/benchmark/metrics/basic_metrics.py | 5 +- .../metrics/chain_of_thought_metric.py | 72 ++++++++++--------- .../benchmark/run_specs/lite_run_specs.py | 5 +- src/helm/benchmark/static/schema_lite_v2.yaml | 8 ++- 4 files changed, 48 insertions(+), 42 deletions(-) diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 0e11ac24dd..6031be068e 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -179,9 +179,8 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat derived_stats: List[Stat] = [] derived_stats.extend(compute_calibration_metrics(per_instance_stats)) return derived_stats - - -class BasicGenerationMetric(ReferenceMetric): + +class BasicReferenceMetric(ReferenceMetric): """ Defines basic metrics for Scenarios that use one Request per Reference instead of one per Instance. 
diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 6e6830d9c7..fe8c5c982f 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -22,40 +22,42 @@ class ChainOfThoughtMetric(Metric): because we abuse "references" to store metadata rather than true metadata.""" - def evaluate_generation( - self, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, - eval_cache_path: str, -) -> List[Stat]: - # Output from the model - output_text = request_state.result.completions[0].text - - # Initial regex pattern to match answer - match = re.search(r'answer is \(?([A-J])\)?', output_text) - - # Secondary regex pattern if the initial one fails - if not match: - match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) - - # Fallback mechanism - if match: - extracted_answer = match.group(1) - else: - extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) - - # Find the correct answer from references - correct_answer = None - for option in request_state.instance.references: - if option.get("is_correct"): - correct_answer = option.get("label") # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. - break - - # Return the score in the specified format - score = 1 if extracted_answer == correct_answer else 0 - return [ - Stat(MetricName("chain_of_thought_correct")).add(score) - ] + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + # Output from the model + output_text = request_state.result.completions[0].text + + # Initial regex pattern to match answer + match = re.search(r'answer is \(?([A-J])\)?', output_text) + + # Secondary regex pattern if the initial one fails + if not match: + match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) + + # Fallback mechanism + if match: + extracted_answer = match.group(1) + else: + extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) + + # Find the correct answer from references + correct_answer = None + + #option is an object with attributes + for option in request_state.instance.references: + if option.is_correct: + correct_answer = option # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. 
+ break + + # Return the score in the specified format + score = 1 if extracted_answer == correct_answer else 0 + return [ + Stat(MetricName("chain_of_thought_correct"), score) + ] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 49b905e9f5..96ed4890cc 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -330,8 +330,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) - -@run_spec_function("gpqa") +""" @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions use_chain_of_thought_bool: bool = use_chain_of_thought == "True" @@ -416,7 +415,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready groups=["gpqa"], - ) + )""" @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 3f55d69cd2..d5f06655f9 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -88,6 +88,12 @@ metrics: short_display_name: PEM description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing. lower_is_better: false + - name: chain_of_thought_correct + # TODO: should call this prefix_quasi_exact_match + display_name: COT correct + short_display_name: COT correct + description: TBD. + lower_is_better: false ############################################################ perturbations: [] @@ -156,7 +162,7 @@ run_groups: - efficiency - general_information environment: - main_name: exact_match # non-CoT + main_name: chain_of_thought_correct # non-CoT main_split: test taxonomy: task: "?" 
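For reference, a minimal sketch (not part of the patch) of the Stat accumulation pattern used by the AIRBench metric in the first commit, where the score is fed in via .add(...) rather than through the Stat constructor; a later commit in this series adopts the same form for the chain-of-thought metric. Module paths follow the imports already used above.

from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.statistic import Stat

# Build a named stat and accumulate one observation; .add() returns the Stat,
# so the expression can sit directly in the list returned by evaluate_generation.
score = 1
stat = Stat(MetricName("chain_of_thought_correct")).add(score)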
From a366e24a1451762579331f0d826c0b051f3e04ad Mon Sep 17 00:00:00 2001 From: siyagoel Date: Mon, 11 Nov 2024 15:54:25 -0800 Subject: [PATCH 03/18] Changes to COT metric --- src/helm/benchmark/metrics/basic_metrics.py | 3 ++- .../metrics/chain_of_thought_metric.py | 24 +++++++------------ .../benchmark/run_specs/lite_run_specs.py | 9 +++++-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 6031be068e..48c933f076 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -179,7 +179,8 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat derived_stats: List[Stat] = [] derived_stats.extend(compute_calibration_metrics(per_instance_stats)) return derived_stats - + + class BasicReferenceMetric(ReferenceMetric): """ Defines basic metrics for Scenarios that use one Request per Reference instead of diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index fe8c5c982f..aa9a6ebd6f 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -14,14 +14,12 @@ from typing import List - class ChainOfThoughtMetric(Metric): """Replacement for BasicGenerationMetric for AIRBench 2024. We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` because we abuse "references" to store metadata rather than true metadata.""" - def evaluate_generation( self, adapter_spec: AdapterSpec, @@ -31,24 +29,24 @@ def evaluate_generation( ) -> List[Stat]: # Output from the model output_text = request_state.result.completions[0].text - + # Initial regex pattern to match answer - match = re.search(r'answer is \(?([A-J])\)?', output_text) - + match = re.search(r"answer is \(?([A-J])\)?", output_text) + # Secondary regex pattern if the initial one fails if not match: - match = re.search(r'\.\s*\[aA\]nswer:\s*\(?([A-J])\)?', output_text) - + match = re.search(r"\.\s*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + # Fallback mechanism if match: extracted_answer = match.group(1) else: - extracted_answer = random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']) - + extracted_answer = random.choice(["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]) + # Find the correct answer from references correct_answer = None - #option is an object with attributes + # option is an object with attributes for option in request_state.instance.references: if option.is_correct: correct_answer = option # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. 
@@ -56,8 +54,4 @@ def evaluate_generation( # Return the score in the specified format score = 1 if extracted_answer == correct_answer else 0 - return [ - Stat(MetricName("chain_of_thought_correct"), score) - ] - - + return [Stat(MetricName("chain_of_thought_correct"), score)] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 96ed4890cc..18611a8850 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -25,7 +25,7 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path -from helm.benchmark.metrics.metric import MetricSpec; +from helm.benchmark.metrics.metric import MetricSpec @run_spec_function("narrative_qa") @@ -330,6 +330,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) + """ @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions @@ -417,6 +418,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot groups=["gpqa"], )""" + @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions @@ -500,6 +502,9 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready + metric_specs=get_exact_match_metric_specs() + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], # TODO: update this after cot metric is ready groups=["gpqa"], ) From d676183888c00fc8dceb672c52eeafef1752fb35 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:09:47 -0800 Subject: [PATCH 04/18] Changes to COT Metric --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index aa9a6ebd6f..5527d3a20b 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -28,7 +28,11 @@ def evaluate_generation( eval_cache_path: str, ) -> List[Stat]: # Output from the model - output_text = request_state.result.completions[0].text + if request_state.result is not None and request_state.result.completions: + output_text = request_state.result.completions[0].text + else: + raise ValueError("Request result is None or completions is empty") + # Initial regex pattern to match answer match = re.search(r"answer is \(?([A-J])\)?", output_text) From de6b9b1781691bbcd1a1bbb1b382ce65d1e8d2a7 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:11:32 -0800 Subject: [PATCH 05/18] Changes made to file. 
--- src/helm/benchmark/metrics/chain_of_thought_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 5527d3a20b..ddaa692ef2 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -33,7 +33,6 @@ def evaluate_generation( else: raise ValueError("Request result is None or completions is empty") - # Initial regex pattern to match answer match = re.search(r"answer is \(?([A-J])\)?", output_text) From 6c09cbc886424649837031302f93d30ebd0c2e23 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:14:55 -0800 Subject: [PATCH 06/18] Changes made --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index ddaa692ef2..a710d49252 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -2,8 +2,6 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics -from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName from helm.benchmark.metrics.metric_service import MetricService @@ -11,7 +9,6 @@ import re import random -from typing import List class ChainOfThoughtMetric(Metric): From 2e02fb7686a2e06f085e25516f6c67a9e3d54c10 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:17:13 -0800 Subject: [PATCH 07/18] Committing changes --- .../benchmark/run_specs/lite_run_specs.py | 89 ------------------- 1 file changed, 89 deletions(-) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 18611a8850..d2b2b148d1 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -330,95 +330,6 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) - -""" @run_spec_function("gpqa") -def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: - # Convert to bools and remove the str versions - use_chain_of_thought_bool: bool = use_chain_of_thought == "True" - use_few_shot_bool: bool = use_few_shot == "True" - del use_chain_of_thought - del use_few_shot - - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset} - ) - max_train_instance_num = 5 if use_few_shot_bool else 0 - - if use_few_shot_bool: - if use_chain_of_thought_bool: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_tokens=1000, # following original repo - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." 
- ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - chain_of_thought_prefix="Let's think step by step: ", - chain_of_thought_suffix="The correct answer is ", - output_noun="", # will be overwritten with output_prefix - output_prefix="", - global_suffix=( - "Give step by step reasoning before you answer, and when you’re ready to answer, " - 'please use the format "The correct answer is (insert answer here)":' - ), - ) - else: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." - ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - output_noun="", # will be overwritten with output_prefix - output_prefix="The correct answer is ", - ) - else: - if use_chain_of_thought_bool: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=( - "Let’s think step by step. Based on your reasoning, what is the single, " - "most likely answer choice? Format your response as follows: " - '"The correct answer is (insert answer here)".' - ), - ) - else: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), - ) - - return RunSpec( - name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs() + [MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),], # TODO: update this after cot metric is ready - groups=["gpqa"], - )""" - - @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions From d039a9d354d0c6657f4943de2e2f1116916f9fe5 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 21:17:55 -0800 Subject: [PATCH 08/18] Changes committed --- src/helm/benchmark/run_specs/lite_run_specs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index d2b2b148d1..359ce157f2 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -330,6 +330,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) + @run_spec_function("gpqa") def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: # Convert to bools and remove the str versions From af0318521ccb85a094ea369925770971b77b6c0f Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:29:23 -0800 Subject: [PATCH 09/18] orrect changes to metric --- .../metrics/chain_of_thought_metric.py | 53 
++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index a710d49252..1ebbd15e26 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -11,11 +11,26 @@ import random -class ChainOfThoughtMetric(Metric): - """Replacement for BasicGenerationMetric for AIRBench 2024. +def extract_answer(output_text: str) -> str: + """ + Extracts the answer from the output text using two exact regex patterns. + Returns "N/A" if no valid answer is found. + """ + # First regex: Matches "answer is (A-J)" with optional parentheses + match = re.search(r"answer is \(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # Second regex: Matches "[answer: (A-J)]" with optional leading characters like "." + match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # If neither regex matches, return "N/A" + return "N/A" - We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` - because we abuse "references" to store metadata rather than true metadata.""" +class ChainOfThoughtMetric(Metric): + """Replacement for BasicGenerationMetric for AIRBench 2024.""" def evaluate_generation( self, @@ -27,31 +42,17 @@ def evaluate_generation( # Output from the model if request_state.result is not None and request_state.result.completions: output_text = request_state.result.completions[0].text - else: - raise ValueError("Request result is None or completions is empty") - - # Initial regex pattern to match answer - match = re.search(r"answer is \(?([A-J])\)?", output_text) - - # Secondary regex pattern if the initial one fails - if not match: - match = re.search(r"\.\s*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + + # Extract the answer using the updated logic + extracted_answer = extract_answer(output_text) - # Fallback mechanism - if match: - extracted_answer = match.group(1) - else: - extracted_answer = random.choice(["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]) - - # Find the correct answer from references + # Find the correct answer from references by translating index to letter correct_answer = None - - # option is an object with attributes - for option in request_state.instance.references: + for index, option in enumerate(request_state.instance.references): if option.is_correct: - correct_answer = option # Assuming 'label' holds the answer letter, e.g., "A", "B", etc. + correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) 
break - # Return the score in the specified format score = 1 if extracted_answer == correct_answer else 0 - return [Stat(MetricName("chain_of_thought_correct"), score)] + # Return the score in the specified format + return [Stat(MetricName("chain_of_thought_correct"), score)] \ No newline at end of file From d675da0ae0e8fd737baec0a066dffe0d94a807ea Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:29:55 -0800 Subject: [PATCH 10/18] format changes --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 1ebbd15e26..e1c6f53769 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -29,6 +29,7 @@ def extract_answer(output_text: str) -> str: # If neither regex matches, return "N/A" return "N/A" + class ChainOfThoughtMetric(Metric): """Replacement for BasicGenerationMetric for AIRBench 2024.""" @@ -42,7 +43,7 @@ def evaluate_generation( # Output from the model if request_state.result is not None and request_state.result.completions: output_text = request_state.result.completions[0].text - + # Extract the answer using the updated logic extracted_answer = extract_answer(output_text) @@ -55,4 +56,4 @@ def evaluate_generation( score = 1 if extracted_answer == correct_answer else 0 # Return the score in the specified format - return [Stat(MetricName("chain_of_thought_correct"), score)] \ No newline at end of file + return [Stat(MetricName("chain_of_thought_correct"), score)] From 16afbbeb2bf48abb637c8ead968522dfbd356336 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:31:01 -0800 Subject: [PATCH 11/18] changes --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index e1c6f53769..a97bf95e5e 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -8,7 +8,6 @@ from helm.benchmark.metrics.statistic import Stat import re -import random def extract_answer(output_text: str) -> str: From d367578959f69d950982df4127d4ba30febca95e Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:55:44 -0800 Subject: [PATCH 12/18] changes to file --- .../metrics/air_bench_metrics copy.py | 56 ------------------- .../metrics/chain_of_thought_metric.py | 2 + 2 files changed, 2 insertions(+), 56 deletions(-) delete mode 100644 src/helm/benchmark/metrics/air_bench_metrics copy.py diff --git a/src/helm/benchmark/metrics/air_bench_metrics copy.py b/src/helm/benchmark/metrics/air_bench_metrics copy.py deleted file mode 100644 index 97401b9978..0000000000 --- a/src/helm/benchmark/metrics/air_bench_metrics copy.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import List - -from helm.benchmark.adaptation.adapter_spec import AdapterSpec -from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics -from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric -from helm.benchmark.metrics.metric import Metric -from helm.benchmark.metrics.metric_name import MetricName -from helm.benchmark.metrics.metric_service import MetricService -from helm.benchmark.metrics.statistic import Stat - - -class 
AIRBench2024BasicGenerationMetric(Metric): - """Replacement for BasicGenerationMetric for AIRBench 2024. - - We call compute_request_state_metrics here because we can't use `BasicGenerationMetric` - because we abuse "references" to store metadata rather than true metadata.""" - - def __init__(self): - super().__init__() - self.efficiency_metric = EfficiencyMetric() - - def evaluate_generation( - self, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, - eval_cache_path: str, - ) -> List[Stat]: - return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service) - - -class AIRBench2024ScoreMetric(Metric): - """Score metrics for AIRBench 2024.""" - - def evaluate_generation( - self, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, - eval_cache_path: str, - ) -> List[Stat]: - assert len(request_state.instance.references) > 1 - category_text = request_state.instance.references[0].output.text - category_parts = category_text.split(".") - assert len(category_parts) == 3 - assert request_state.annotations - score = request_state.annotations["air_bench_2024"]["score"] - return [ - Stat(MetricName("air_score")).add(score), - Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score), - Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score), - Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add( - score - ), - ] diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index a97bf95e5e..fd28dc47b9 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -52,6 +52,8 @@ def evaluate_generation( if option.is_correct: correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) break + + print(request_state.instance.id, correct_answer, extracted_answer) score = 1 if extracted_answer == correct_answer else 0 # Return the score in the specified format From 23968c24612e8d3345be0b2cd6accf6f319602f1 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:56:42 -0800 Subject: [PATCH 13/18] changed format --- src/helm/benchmark/metrics/chain_of_thought_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index fd28dc47b9..c7d8560cba 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -52,7 +52,7 @@ def evaluate_generation( if option.is_correct: correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) 
break - + print(request_state.instance.id, correct_answer, extracted_answer) score = 1 if extracted_answer == correct_answer else 0 From 90ac1947faca4d7b1bad81d5aae1bd7f372f20bf Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:59:06 -0800 Subject: [PATCH 14/18] changes to file by deleting --- src/helm/benchmark/run_specs/lite_run_specs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 552b80267f..bac947e3ca 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -20,8 +20,7 @@ get_f1_metric_specs, get_generative_harms_metric_specs, get_generic_metric_specs, - get_open_ended_generation_metric_specs, - MetricSpec, + get_open_ended_generation_metric_specs ) from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path From 7cfbb1c2db1fd1f954ebd970a7882681556b8ed0 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Wed, 13 Nov 2024 23:59:48 -0800 Subject: [PATCH 15/18] reformat file --- src/helm/benchmark/run_specs/lite_run_specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index bac947e3ca..7fd40dcf91 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -20,7 +20,7 @@ get_f1_metric_specs, get_generative_harms_metric_specs, get_generic_metric_specs, - get_open_ended_generation_metric_specs + get_open_ended_generation_metric_specs, ) from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path From c87682861b9b6f6d2a05d36ec7e8cb0b7e7c38bf Mon Sep 17 00:00:00 2001 From: siyagoel Date: Fri, 15 Nov 2024 02:46:23 -0800 Subject: [PATCH 16/18] changes in files for schema_lite_z2.yaml --- .../metrics/chain_of_thought_metric.py | 2 +- src/helm/benchmark/scenarios/mmlu_pro.py | 34 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index c7d8560cba..23eb3e65a0 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -57,4 +57,4 @@ def evaluate_generation( score = 1 if extracted_answer == correct_answer else 0 # Return the score in the specified format - return [Stat(MetricName("chain_of_thought_correct"), score)] + return [Stat(MetricName("chain_of_thought_correct")).add(score)] diff --git a/src/helm/benchmark/scenarios/mmlu_pro.py b/src/helm/benchmark/scenarios/mmlu_pro.py index a091387dc2..4b50f4e9df 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro.py +++ b/src/helm/benchmark/scenarios/mmlu_pro.py @@ -1,9 +1,17 @@ from typing import Dict, List -from datasets import load_dataset +from datasets import Dataset, load_dataset from helm.common.hierarchical_logger import hlog -from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output - +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) class MMLUProScenario(Scenario): """ @@ -33,7 +41,14 @@ def __init__(self, subject: str): super().__init__() self.subject: str = subject - def process_csv(self, data, split: str) 
-> List[Instance]: + def process_dataset(self, data: Dataset, split: str) -> List[Instance]: + """ + Process the dataset to create instances. + + :param data: Hugging Face `Dataset` containing the data for a specific split. + :param split: The data split (e.g., "train", "test"). + :return: A list of processed `Instance` objects. + """ instances: List[Instance] = [] hlog(f"Processing data for {split} split") for row in data: @@ -55,8 +70,14 @@ def answer_to_reference(answer: str) -> Reference: return instances def get_instances(self, output_path: str) -> List[Instance]: + """ + Load and process the MMLU-Pro dataset to create instances. + + :param output_path: Path to save or output the processed instances. + :return: A list of all processed `Instance` objects. + """ # Load the MMLU-Pro dataset from Hugging Face - dataset = load_dataset("TIGER-Lab/MMLU-Pro") + dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b") # Process all the instances instances: List[Instance] = [] @@ -66,6 +87,7 @@ def get_instances(self, output_path: str) -> List[Instance]: } for hf_split, split in splits.items(): data = dataset[hf_split].filter(lambda x: x["category"] == self.subject) - instances.extend(self.process_csv(data, split)) + instances.extend(self.process_dataset(data, split)) return instances + From 97a9affa745ded322611892f293703bc6da8b1ef Mon Sep 17 00:00:00 2001 From: siyagoel Date: Fri, 15 Nov 2024 03:11:31 -0800 Subject: [PATCH 17/18] Changes to address comments --- .../metrics/chain_of_thought_metric.py | 59 ++++++++++++++----- .../benchmark/run_specs/lite_run_specs.py | 2 +- src/helm/benchmark/static/schema_lite_v2.yaml | 28 ++++----- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 23eb3e65a0..67efdbb942 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -1,4 +1,5 @@ -from typing import List +import re +from typing import List, Optional from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.adaptation.request_state import RequestState @@ -7,13 +8,16 @@ from helm.benchmark.metrics.metric_service import MetricService from helm.benchmark.metrics.statistic import Stat -import re - - -def extract_answer(output_text: str) -> str: +def extract_answer(output_text: str) -> Optional[str]: """ Extracts the answer from the output text using two exact regex patterns. - Returns "N/A" if no valid answer is found. + Returns None if no valid answer is found. + + Args: + output_text (str): The text from which to extract the answer. + + Returns: + Optional[str]: The extracted answer (A-J) if found, otherwise None. """ # First regex: Matches "answer is (A-J)" with optional parentheses match = re.search(r"answer is \(?([A-J])\)?", output_text) @@ -25,12 +29,16 @@ def extract_answer(output_text: str) -> str: if match: return match.group(1) - # If neither regex matches, return "N/A" - return "N/A" + # If neither regex matches, return None + return None class ChainOfThoughtMetric(Metric): - """Replacement for BasicGenerationMetric for AIRBench 2024.""" + """ + This metric focuses on structured reasoning and the accuracy of extracted answers. + It compares model outputs against correct answers provided in a multiple-choice + format and returns a score indicating the correctness of the generated response. 
+ """ def evaluate_generation( self, @@ -39,9 +47,30 @@ def evaluate_generation( metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: - # Output from the model - if request_state.result is not None and request_state.result.completions: - output_text = request_state.result.completions[0].text + """ + Evaluate the generated output for chain-of-thought reasoning accuracy. + + The method extracts the model's output, determines the correct answer + from the provided references, and compares the two to compute a binary score. + + Args: + adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation. + request_state (RequestState): The state of the current request, including + the input instance, output results, and references. + metric_service (MetricService): A service used to compute metrics if needed. + eval_cache_path (str): Path to the evaluation cache for storing or retrieving data. + + Returns: + List[Stat]: A list containing a single `Stat` object with the correctness + score (1 for correct, 0 for incorrect) under the metric + name "chain_of_thought_correct". + """ + # Assert that completions exist if the result is not None + assert request_state.result is not None and request_state.result.completions, \ + "Request state result must have completions." + + # Set output_text if the assertion passes + output_text = request_state.result.completions[0].text # Extract the answer using the updated logic extracted_answer = extract_answer(output_text) @@ -53,8 +82,10 @@ def evaluate_generation( correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) break - print(request_state.instance.id, correct_answer, extracted_answer) + # Raise an exception if no correct answer is found + if correct_answer is None: + raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}") + # Compare extracted answer with the correct answer and compute the score score = 1 if extracted_answer == correct_answer else 0 - # Return the score in the specified format return [Stat(MetricName("chain_of_thought_correct")).add(score)] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 7fd40dcf91..41ab3e1477 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -417,7 +417,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot metric_specs=get_exact_match_metric_specs() + [ MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), - ], # TODO: update this after cot metric is ready + ], groups=["gpqa"], ) diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 47730d2d68..b00b87e76f 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -88,6 +88,16 @@ metrics: short_display_name: PEM description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing. lower_is_better: false + - name: ifeval_strict_accuracy + display_name: IFEval strict accuracy + short_display_name: IFEval Strict Acc + description: Fraction of instructions in the instance that are correctly followed. + lower_is_better: false + - name: chain_of_thought_correct + display_name: COT correct + short_display_name: COT correct + description: TBD. 
+ lower_is_better: false ############################################################ perturbations: [] @@ -130,7 +140,6 @@ run_groups: subgroups: - mmlu_pro - gpqa - - ifeval - name: mmlu_pro display_name: MMLU-Pro @@ -165,20 +174,3 @@ run_groups: who: "?" when: "?" language: English - - - name: ifeval - display_name: IFEval - description: IFEval - metric_groups: - - accuracy - - efficiency - - general_information - environment: - main_name: ifeval_strict_accuracy - main_split: test - taxonomy: - task: "?" - what: "?" - who: "?" - when: "?" - language: English From 6d5eb55f634bd2a4c8a921d17e74708e29a47641 Mon Sep 17 00:00:00 2001 From: siyagoel Date: Fri, 15 Nov 2024 03:12:13 -0800 Subject: [PATCH 18/18] changes added based on comments --- .../metrics/chain_of_thought_metric.py | 18 ++++++++++-------- src/helm/benchmark/run_specs/lite_run_specs.py | 2 +- src/helm/benchmark/scenarios/mmlu_pro.py | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py index 67efdbb942..32cfd880f3 100644 --- a/src/helm/benchmark/metrics/chain_of_thought_metric.py +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -8,6 +8,7 @@ from helm.benchmark.metrics.metric_service import MetricService from helm.benchmark.metrics.statistic import Stat + def extract_answer(output_text: str) -> Optional[str]: """ Extracts the answer from the output text using two exact regex patterns. @@ -35,8 +36,8 @@ def extract_answer(output_text: str) -> Optional[str]: class ChainOfThoughtMetric(Metric): """ - This metric focuses on structured reasoning and the accuracy of extracted answers. - It compares model outputs against correct answers provided in a multiple-choice + This metric focuses on structured reasoning and the accuracy of extracted answers. + It compares model outputs against correct answers provided in a multiple-choice format and returns a score indicating the correctness of the generated response. """ @@ -50,24 +51,25 @@ def evaluate_generation( """ Evaluate the generated output for chain-of-thought reasoning accuracy. - The method extracts the model's output, determines the correct answer + The method extracts the model's output, determines the correct answer from the provided references, and compares the two to compute a binary score. Args: adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation. - request_state (RequestState): The state of the current request, including + request_state (RequestState): The state of the current request, including the input instance, output results, and references. metric_service (MetricService): A service used to compute metrics if needed. eval_cache_path (str): Path to the evaluation cache for storing or retrieving data. Returns: - List[Stat]: A list containing a single `Stat` object with the correctness - score (1 for correct, 0 for incorrect) under the metric + List[Stat]: A list containing a single `Stat` object with the correctness + score (1 for correct, 0 for incorrect) under the metric name "chain_of_thought_correct". """ # Assert that completions exist if the result is not None - assert request_state.result is not None and request_state.result.completions, \ - "Request state result must have completions." + assert ( + request_state.result is not None and request_state.result.completions + ), "Request state result must have completions." 
# Set output_text if the assertion passes output_text = request_state.result.completions[0].text diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 41ab3e1477..e7c5ea8a83 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -417,7 +417,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot metric_specs=get_exact_match_metric_specs() + [ MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), - ], + ], groups=["gpqa"], ) diff --git a/src/helm/benchmark/scenarios/mmlu_pro.py b/src/helm/benchmark/scenarios/mmlu_pro.py index 4b50f4e9df..5d08d4f9d1 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro.py +++ b/src/helm/benchmark/scenarios/mmlu_pro.py @@ -13,6 +13,7 @@ Output, ) + class MMLUProScenario(Scenario): """ The MMLU-Pro dataset is an advanced version of the Massive Multitask Language Understanding (MMLU) @@ -90,4 +91,3 @@ def get_instances(self, output_path: str) -> List[Instance]: instances.extend(self.process_dataset(data, split)) return instances -
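As a quick sanity check of the extraction logic introduced in this series, the following standalone sketch (not part of the patch) exercises the two patterns from extract_answer as they stand at the end of the series. Note that the escaped brackets in the second pattern match the literal text "[aA]nswer:" rather than an a/A character class, so an output such as "Answer: (D)" falls through to None and is scored as incorrect.

import re
from typing import Optional


def extract_answer(output_text: str) -> Optional[str]:
    """Same two patterns as in chain_of_thought_metric.py."""
    match = re.search(r"answer is \(?([A-J])\)?", output_text)
    if match:
        return match.group(1)
    # \[aA\] is a literal "[aA]", not a character class.
    match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text)
    if match:
        return match.group(1)
    return None


if __name__ == "__main__":
    samples = [
        "Let's think step by step ... The correct answer is (B).",  # first pattern -> "B"
        "Reasoning omitted. [aA]nswer: (D)",                        # second pattern -> "D"
        "Answer: (D)",                                              # no match -> None, scored 0
    ]
    for text in samples:
        print(f"{text!r} -> {extract_answer(text)!r}")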