addressing comments
liamjxu committed Dec 14, 2024
1 parent 332fabb commit 75e3ded
Showing 4 changed files with 51 additions and 52 deletions.
79 changes: 39 additions & 40 deletions src/helm/benchmark/annotation/bigcodebench_annotator.py
@@ -1,17 +1,21 @@
from typing import Any, List

import ast
import traceback
import time
import json

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.common.request import Request
from helm.common.hierarchical_logger import hlog

from typing import Any, List
from gradio_client import Client, handle_file
from tempfile import TemporaryDirectory
from tenacity import retry, stop_after_attempt, wait_fixed

from helm.common.hierarchical_logger import hlog

import ast
import traceback
import time
import json
OUTPUT_FILENAME = "tmp_result.jsonl"


def syntax_check(code, verbose=False):
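The body of syntax_check is collapsed in this view. For readers following the diff, a plausible minimal implementation (a sketch, not necessarily the committed code) validates a candidate solution with ast.parse, which is consistent with the ast and traceback imports above:

import ast
import traceback


def syntax_check(code, verbose=False):
    # Return True when `code` parses as valid Python.
    try:
        ast.parse(code)
        return True
    except (SyntaxError, MemoryError):
        if verbose:
            traceback.print_exc()
        return False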
@@ -51,19 +55,32 @@ def __init__(self):
        self.split = "instruct"
        self.subset = "full"
        self.pass_k = "1"  # Original: "1,5,10"
        self.is_macro = True
        self.use_global_metric = True

    def annotate(self, request_state: RequestState) -> Any:
        pass

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(4))
    def predict_with_retry(self, filename):
        client = Client(self.remote_execute_api)
        results, pass_at_k = client.predict(
            split=self.split,
            subset=self.subset,
            samples=handle_file(filename),
            pass_k=self.pass_k,
            api_name="/predict",
        )
        pass_at_one = pass_at_k["pass@1"]  # pass@1 is a single score, not a pair
        return results, pass_at_one


    def annotate_all(self, request_states: List[RequestState]) -> Any:
        assert all(request_state.result for request_state in request_states)
        assert all(len(request_state.result.completions) == 1 for request_state in request_states)
        assert all(request_state.instance.extra_data for request_state in request_states)

        with TemporaryDirectory() as tmpdir:
            # with open(f"{tmpdir}/result.jsonl", "w") as file:
            with open(f"tmp_result.jsonl", "w") as file:
            with open(OUTPUT_FILENAME, "w") as file:
                res = []
                for i in range(1140):
                    init_line = f'{{"task_id": "BigCodeBench/{i}", "solution": ""}}\n'
@@ -73,37 +90,19 @@ def annotate_all(self, request_states: List[RequestState]) -> Any:
                    model_output_text = request_state.result.completions[0].text
                    solution = code_extract(model_output_text)
                    escaped_solution = json.dumps(solution)[1:-1]
                    idx = int(request_state.instance.extra_data["task_id"].split("/")[-1])
                    res[idx] = (
                        f'{{"task_id": "{request_state.instance.extra_data["task_id"]}", "solution": "{escaped_solution}"}}\n'
                    )
                    idx = int(request_state.instance.id.split("/")[-1])
                    # json.dumps performs its own escaping, so pass the raw solution here
                    # (re-using escaped_solution would double-escape it).
                    res[idx] = json.dumps({"task_id": request_state.instance.id, "solution": solution}) + "\n"
                for line in res:
                    file.write(line)

        pass_at_one: float
        max_retries = 3
        retry_count = 0
        success = False  # Flag to indicate if the operation was successful
        while retry_count < max_retries:
            try:
                client = Client(self.remote_execute_api)
                results, pass_at_k = client.predict(
                    split=self.split,
                    subset=self.subset,
                    # samples=handle_file(f"{tmpdir}/result.jsonl"),
                    samples=handle_file(f"tmp_result.jsonl"),
                    pass_k=self.pass_k,
                    api_name="/predict",
                )
                success = True  # Operation succeeded
                pass_at_one = pass_at_k["pass@1"]
                break
            except Exception as e:
                retry_count += 1
                hlog(f"Attempt {retry_count} failed. Error Message: {e}. Retrying in 4s...")
                time.sleep(4)
        if not success:
            hlog("Failed to complete the operation after 3 attempts.")
            pass_at_one = 0.0

        return {"pass_at_one": pass_at_one}
        try:
            results, pass_at_one = self.predict_with_retry(OUTPUT_FILENAME)
        except Exception as e:
            hlog(f"Failed to complete the operation after 3 attempts. Error: {e}")
            pass_at_one = 0.0
            results = None

        if results is None:
            # Remote evaluation failed entirely: mark every instance as not passing.
            return [{"pass_at_one": False} for _ in request_states]

        ret = [
            {"pass_at_one": results["eval"][state.instance.id][0]["status"] == "pass"}
            for state in request_states
        ]
        return ret
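The predict_with_retry method above replaces the hand-rolled retry loop shown in the removed lines with tenacity's declarative policy. A self-contained sketch of the same pattern; the flaky function and counter are illustrative, not part of HELM:

import tenacity
from tenacity import retry, stop_after_attempt, wait_fixed

attempts = 0


@retry(stop=stop_after_attempt(3), wait=wait_fixed(4))
def flaky_call() -> str:
    # Fail twice, then succeed, to exercise the retry policy.
    global attempts
    attempts += 1
    if attempts < 3:
        raise ConnectionError("transient failure")
    return "ok"


try:
    print(flaky_call())  # prints "ok" on the third attempt, after two 4s waits
except tenacity.RetryError:
    # Once attempts are exhausted, tenacity raises RetryError by default
    # (pass reraise=True to surface the original exception instead),
    # which is why annotate_all still wraps the call in try/except.
    print("all retries exhausted")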
6 changes: 3 additions & 3 deletions src/helm/benchmark/annotation_executor.py
@@ -92,14 +92,14 @@ def execute(self, scenario_state: ScenarioState) -> ScenarioState:
hlog("No annotators to run.")
return scenario_state

if all(getattr(self.factory.get_annotator(spec), "is_macro", False) for spec in scenario_state.annotator_specs):
if all(getattr(self.factory.get_annotator(spec), "use_global_metric", False) for spec in scenario_state.annotator_specs):
# Do it!
request_states = self.process_all(
scenario_state.annotator_specs, scenario_state.request_states # processing all request together
)

else:
hlog("!!!!Annotators are not all is_macro!.")
hlog("!!!!Annotators are not all use_global_metric!.")

# Do it!
def do_it(request_state: RequestState) -> RequestState:
@@ -141,4 +141,4 @@ def process_all(self, annotator_specs: List[AnnotatorSpec], states: List[Request
                annotations[annotator.name] = new_annotations
        except Exception as e:
            raise AnnotationExecutorError(f"{str(e)} while annotating {len(states)} request states") from e
        return [replace(state, annotations=annotations) for state in states]
        # Key each state's annotations by annotator name so downstream metrics can look them up.
        return [replace(state, annotations={annotator.name: new_annotations[idx]}) for idx, state in enumerate(states)]
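For context, the renamed flag drives the branch in execute above: when every annotator sets use_global_metric, the executor makes one batched call per annotator over all request states; otherwise it annotates each state independently. A minimal sketch of that dispatch rule, with illustrative classes that are not the HELM implementations:

from typing import Any, List


class PerInstanceAnnotator:
    name = "per_instance"

    def annotate(self, state: Any) -> Any:
        return {"score": 1.0}


class GlobalAnnotator:
    name = "global"
    use_global_metric = True  # mirrors the flag checked by the executor

    def annotate_all(self, states: List[Any]) -> List[Any]:
        # One result per state, computed in a single batched pass.
        return [{"score": 1.0} for _ in states]


def run(annotators: List[Any], states: List[Any]) -> List[List[Any]]:
    if all(getattr(a, "use_global_metric", False) for a in annotators):
        return [a.annotate_all(states) for a in annotators]
    return [[a.annotate(s) for s in states] for a in annotators]


print(run([GlobalAnnotator()], ["s1", "s2"]))       # batched path
print(run([PerInstanceAnnotator()], ["s1", "s2"]))  # per-state path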
6 changes: 3 additions & 3 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -481,18 +481,18 @@ def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec


@run_spec_function("bigcodebench")
def get_bigcodebench_spec(subset: str) -> RunSpec:
def get_bigcodebench_spec(version: str) -> RunSpec:

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"subset": subset}
        class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
    )

    # Adapted from https://github.dev/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py
    adapter_spec = AdapterSpec(
        method=ADAPT_GENERATION,
        input_prefix="",
        output_prefix="",
        max_tokens=1000,
        max_tokens=1280,
        num_outputs=1,
        temperature=0.0,
        global_prefix="Please provide a self-contained Python script that solves the following problem in a markdown code block:",
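With the argument renamed from subset to version, a run entry now selects the scenario by benchmark release. Assuming HELM's usual run-entry syntax (exact flag names vary across HELM releases), an invocation might look like:

helm-run --run-entries bigcodebench:version=v0.1.2 --suite my-suite --max-eval-instances 10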
12 changes: 6 additions & 6 deletions src/helm/benchmark/scenarios/bigcodebench_scenario.py
@@ -10,7 +10,7 @@
from helm.common.general import ensure_directory_exists


SUBSETS = ["v0.1.2"]
VERSIONS = ["v0.1.2"]


class BigCodeBenchScenario(Scenario):
@@ -25,20 +25,19 @@ class BigCodeBenchScenario(Scenario):
    description = "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
    tags = ["coding"]

    def __init__(self, subset: str):
    def __init__(self, version: str):
        super().__init__()
        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
        self.subset = subset
        assert version in VERSIONS, "Unknown version: {}".format(version)
        self.version = version

    def get_instances(self, output_path: str) -> List[Instance]:
        # Get BigCodeBench from HuggingFace
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        dataset = datasets.load_dataset(
            "bigcode/bigcodebench",
            trust_remote_code=True,
            cache_dir=cache_dir,
            split="v0.1.2",
            split=self.version,
        )
        assert isinstance(dataset, datasets.Dataset)

@@ -51,6 +50,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
                input=input,
                references=[],
                split=TEST_SPLIT,
                id=row["task_id"],
                extra_data={"task_id": row["task_id"]},
            )
            instances.append(instance)
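Because each BigCodeBench release is published as a split of the bigcode/bigcodebench dataset, pinning the split pins the benchmark version, which is what passing split=self.version achieves. A minimal sketch, assuming network access and the current dataset layout:

import datasets

# Each release is exposed as a split, e.g. "v0.1.2".
dataset = datasets.load_dataset("bigcode/bigcodebench", trust_remote_code=True, split="v0.1.2")
print(len(dataset))            # number of tasks in this release
print(dataset[0]["task_id"])   # e.g. "BigCodeBench/0"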
