Skip to content

Commit

Permalink
addressed comments and tested
Browse files Browse the repository at this point in the history
  • Loading branch information
liamjxu committed Dec 15, 2024
1 parent 263f69b commit 70e7937
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 14 deletions.
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ metrics =
sacrebleu~=2.2.1 # For disinformation_metrics, machine_translation_metrics
langdetect~=1.0.9 # For ifeval_metrics
immutabledict~=4.2.0 # For ifeval_metrics
gradio_client==1.4.3 # For bigcodebench_metrics
gradio_client~=1.3 # For bigcodebench_metrics
tenacity~=9.0.0 # For bigcodebench_metrics

summarization =
summ-eval~=0.892 # For summarization_metrics
Expand Down
18 changes: 8 additions & 10 deletions src/helm/benchmark/annotation/bigcodebench_annotator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@

import ast
import traceback
import time
import json

from helm.benchmark.adaptation.request_state import RequestState
Expand Down Expand Up @@ -56,21 +54,22 @@ def __init__(self):
self.subset = "full"
self.pass_k = "1" # Original: "1,5,10"
self.use_global_metric = True
self.num_instances = 1140 # Instruct full setting of the dataset

def annotate(self, request_state: RequestState) -> Any:
pass

@retry(stop=stop_after_attempt(3), wait=wait_fixed(4))
def predict_with_retry(self, filename):
def predict_with_retry(self, filename: str):
client = Client(self.remote_execute_api)
results, pass_at_k = client.predict(
results, evals = client.predict(
split=self.split,
subset=self.subset,
samples=handle_file(filename),
pass_k=self.pass_k,
api_name="/predict",
)
results, pass_at_one = pass_at_k["pass@1"]
pass_at_one = evals["pass@1"]
return results, pass_at_one


Expand All @@ -82,17 +81,16 @@ def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any
with TemporaryDirectory() as tmpdir:
with open(OUTPUT_FILENAME, "w") as file:
res = []
for i in range(1140):
for i in range(self.num_instances):
init_line = f'{{"task_id": "BigCodeBench/{i}", "solution": ""}}\n'
res.append(init_line)
for request_state in request_states:
line: str
model_output_text = request_state.result.completions[0].text
solution = code_extract(model_output_text)
escaped_solution = json.dumps(solution)[1:-1]
idx = int(request_state.instance.id.split("/")[-1])
res[idx] = json.dumps(
{"task_id": request_state.instance.id, "solution": escaped_solution}
{"task_id": request_state.instance.id, "solution": solution}
) + "\n"
for line in res:
file.write(line)
Expand All @@ -104,7 +102,7 @@ def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any
pass_at_one = 0.0
results = []
if len(results):
ret = [{"pass_at_one": results['eval'][state.instance.id][0]['status'] == 'pass'} for state in request_states]
ret = [{'bigcodebench': {"pass_at_one": results['eval'][state.instance.id][0]['status'] == 'pass'}} for state in request_states]
else:
ret = [{"pass_at_one": False} for state in request_states]
ret = [{'bigcodebench': {"pass_at_one": False}} for state in request_states]
return ret
2 changes: 0 additions & 2 deletions src/helm/benchmark/annotation_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,6 @@ def execute(self, scenario_state: ScenarioState) -> ScenarioState:
)

else:
hlog("!!!!Annotators are not all use_global_metric!.")

# Do it!
def do_it(request_state: RequestState) -> RequestState:
assert scenario_state.annotator_specs is not None
Expand Down
2 changes: 1 addition & 1 deletion src/helm/benchmark/metrics/bigcodebench_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def evaluate_generation(
eval_cache_path: str,
) -> List[Stat]:
assert request_state.annotations
score = request_state.annotations["bigcodebench"]["pass_at_one"] * 1140 / 1000 # rescale to 0-1
score = request_state.annotations["bigcodebench"]["pass_at_one"]
return [
Stat(MetricName("bigcodebench_p@1")).add(score),
]

0 comments on commit 70e7937

Please sign in to comment.