From 70e7937f6f9b90b61c40dd05b50fdcfec8bf13a5 Mon Sep 17 00:00:00 2001
From: Jialiang Xu
Date: Sat, 14 Dec 2024 18:51:40 -0800
Subject: [PATCH] addressed comments and tested

---
 setup.cfg                                      |  3 ++-
 .../annotation/bigcodebench_annotator.py       | 18 ++++++++----------
 src/helm/benchmark/annotation_executor.py      |  2 --
 .../benchmark/metrics/bigcodebench_metrics.py  |  2 +-
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 0b9b677b88..4a27690471 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -81,7 +81,8 @@ metrics =
     sacrebleu~=2.2.1  # For disinformation_metrics, machine_translation_metrics
     langdetect~=1.0.9  # For ifeval_metrics
     immutabledict~=4.2.0  # For ifeval_metrics
-    gradio_client==1.4.3  # For bigcodebench_metrics
+    gradio_client~=1.3  # For bigcodebench_metrics
+    tenacity~=9.0.0  # For bigcodebench_metrics
 
 summarization =
     summ-eval~=0.892  # For summarization_metrics
diff --git a/src/helm/benchmark/annotation/bigcodebench_annotator.py b/src/helm/benchmark/annotation/bigcodebench_annotator.py
index 1830eb6bb9..4fc936aeb8 100644
--- a/src/helm/benchmark/annotation/bigcodebench_annotator.py
+++ b/src/helm/benchmark/annotation/bigcodebench_annotator.py
@@ -1,7 +1,5 @@
-
 import ast
 import traceback
-import time
 import json
 
 from helm.benchmark.adaptation.request_state import RequestState
@@ -56,21 +54,22 @@ def __init__(self):
         self.subset = "full"
         self.pass_k = "1"  # Original: "1,5,10"
         self.use_global_metric = True
+        self.num_instances = 1140  # Number of instances in the instruct split, full subset of the dataset
 
     def annotate(self, request_state: RequestState) -> Any:
         pass
 
     @retry(stop=stop_after_attempt(3), wait=wait_fixed(4))
-    def predict_with_retry(self, filename):
+    def predict_with_retry(self, filename: str):
         client = Client(self.remote_execute_api)
-        results, pass_at_k = client.predict(
+        results, evals = client.predict(
             split=self.split,
             subset=self.subset,
             samples=handle_file(filename),
             pass_k=self.pass_k,
             api_name="/predict",
         )
-        results, pass_at_one = pass_at_k["pass@1"]
+        pass_at_one = evals["pass@1"]
         return results, pass_at_one
 
 
@@ -82,17 +81,16 @@ def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any]]:
         with TemporaryDirectory() as tmpdir:
             with open(OUTPUT_FILENAME, "w") as file:
                 res = []
-                for i in range(1140):
+                for i in range(self.num_instances):
                     init_line = f'{{"task_id": "BigCodeBench/{i}", "solution": ""}}\n'
                     res.append(init_line)
                 for request_state in request_states:
                     line: str
                     model_output_text = request_state.result.completions[0].text
                     solution = code_extract(model_output_text)
-                    escaped_solution = json.dumps(solution)[1:-1]
                     idx = int(request_state.instance.id.split("/")[-1])
                     res[idx] = json.dumps(
-                        {"task_id": request_state.instance.id, "solution": escaped_solution}
+                        {"task_id": request_state.instance.id, "solution": solution}
                     ) + "\n"
                 for line in res:
                     file.write(line)
@@ -104,7 +102,7 @@ def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any]]:
             pass_at_one = 0.0
             results = []
         if len(results):
-            ret = [{"pass_at_one": results['eval'][state.instance.id][0]['status'] == 'pass'} for state in request_states]
+            ret = [{"bigcodebench": {"pass_at_one": results["eval"][state.instance.id][0]["status"] == "pass"}} for state in request_states]
         else:
-            ret = [{"pass_at_one": False} for state in request_states]
+            ret = [{"bigcodebench": {"pass_at_one": False}} for state in request_states]
         return ret
diff --git a/src/helm/benchmark/annotation_executor.py b/src/helm/benchmark/annotation_executor.py
index 4613214351..59dbf16075 100644
--- a/src/helm/benchmark/annotation_executor.py
+++ b/src/helm/benchmark/annotation_executor.py
@@ -99,8 +99,6 @@ def execute(self, scenario_state: ScenarioState) -> ScenarioState:
 
             )
         else:
-            hlog("!!!!Annotators are not all use_global_metric!.")
-
             # Do it!
             def do_it(request_state: RequestState) -> RequestState:
                 assert scenario_state.annotator_specs is not None
diff --git a/src/helm/benchmark/metrics/bigcodebench_metrics.py b/src/helm/benchmark/metrics/bigcodebench_metrics.py
index 41ba70c796..7d80a328f2 100644
--- a/src/helm/benchmark/metrics/bigcodebench_metrics.py
+++ b/src/helm/benchmark/metrics/bigcodebench_metrics.py
@@ -19,7 +19,7 @@ def evaluate_generation(
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
-        score = request_state.annotations["bigcodebench"]["pass_at_one"] * 1140 / 1000  # rescale to 0-1
+        score = request_state.annotations["bigcodebench"]["pass_at_one"]
         return [
             Stat(MetricName("bigcodebench_p@1")).add(score),
         ]