addressed comments and tested

stanford-crfm · Dec 15, 2024 · 70e7937 · 70e7937
1 parent 263f69b
commit 70e7937
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 14 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -81,7 +81,8 @@ metrics =
     sacrebleu~=2.2.1  # For disinformation_metrics, machine_translation_metrics
     langdetect~=1.0.9  # For ifeval_metrics
     immutabledict~=4.2.0  # For ifeval_metrics
-    gradio_client==1.4.3  # For bigcodebench_metrics
+    gradio_client~=1.3  # For bigcodebench_metrics
+    tenacity~=9.0.0  # For bigcodebench_metrics
 
 summarization =
     summ-eval~=0.892  # For summarization_metrics

diff --git a/src/helm/benchmark/annotation/bigcodebench_annotator.py b/src/helm/benchmark/annotation/bigcodebench_annotator.py
@@ -1,7 +1,5 @@
-
 import ast
 import traceback
-import time
 import json
 
 from helm.benchmark.adaptation.request_state import RequestState
@@ -56,21 +54,22 @@ def __init__(self):
         self.subset = "full"
         self.pass_k = "1"  # Original: "1,5,10"
         self.use_global_metric = True
+        self.num_instances = 1140  # Instruct full setting of the dataset
 
     def annotate(self, request_state: RequestState) -> Any:
         pass
 
     @retry(stop=stop_after_attempt(3), wait=wait_fixed(4))
-    def predict_with_retry(self, filename):
+    def predict_with_retry(self, filename: str):
         client = Client(self.remote_execute_api)
-        results, pass_at_k = client.predict(
+        results, evals = client.predict(
             split=self.split,
             subset=self.subset,
             samples=handle_file(filename),
             pass_k=self.pass_k,
             api_name="/predict",
         )
-        results, pass_at_one = pass_at_k["pass@1"]
+        pass_at_one = evals["pass@1"]
         return results, pass_at_one
 
 
@@ -82,17 +81,16 @@ def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any
         with TemporaryDirectory() as tmpdir:
             with open(OUTPUT_FILENAME, "w") as file:
                 res = []
-                for i in range(1140):
+                for i in range(self.num_instances):
                     init_line = f'{{"task_id": "BigCodeBench/{i}", "solution": ""}}\n'
                     res.append(init_line)
                 for request_state in request_states:
                     line: str
                     model_output_text = request_state.result.completions[0].text
                     solution = code_extract(model_output_text)
-                    escaped_solution = json.dumps(solution)[1:-1]
                     idx = int(request_state.instance.id.split("/")[-1])
                     res[idx] = json.dumps(
-                        {"task_id": request_state.instance.id, "solution": escaped_solution}
+                        {"task_id": request_state.instance.id, "solution": solution}
                     ) + "\n"
                 for line in res:
                     file.write(line)
@@ -104,7 +102,7 @@ def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any
             pass_at_one = 0.0
             results = []
         if len(results):
-            ret = [{"pass_at_one": results['eval'][state.instance.id][0]['status'] == 'pass'} for state in request_states]
+            ret = [{'bigcodebench': {"pass_at_one": results['eval'][state.instance.id][0]['status'] == 'pass'}} for state in request_states]
         else:
-            ret = [{"pass_at_one": False} for state in request_states]
+            ret = [{'bigcodebench': {"pass_at_one": False}} for state in request_states]
         return ret
diff --git a/src/helm/benchmark/annotation_executor.py b/src/helm/benchmark/annotation_executor.py
@@ -99,8 +99,6 @@ def execute(self, scenario_state: ScenarioState) -> ScenarioState:
             )
 
         else:
-            hlog("!!!!Annotators are not all use_global_metric!.")
-
             # Do it!
             def do_it(request_state: RequestState) -> RequestState:
                 assert scenario_state.annotator_specs is not None

diff --git a/src/helm/benchmark/metrics/bigcodebench_metrics.py b/src/helm/benchmark/metrics/bigcodebench_metrics.py
@@ -19,7 +19,7 @@ def evaluate_generation(
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
-        score = request_state.annotations["bigcodebench"]["pass_at_one"] * 1140 / 1000  # rescale to 0-1
+        score = request_state.annotations["bigcodebench"]["pass_at_one"]
         return [
             Stat(MetricName("bigcodebench_p@1")).add(score),
         ]