Adding WildBench (#3150)
liamjxu authored Dec 13, 2024
1 parent 40b3d23 commit 1e55710
Showing 14 changed files with 521 additions and 25 deletions.
1 change: 1 addition & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -6,6 +6,7 @@

# Adaptation methods
ADAPT_GENERATION: str = "generation"
ADAPT_CHAT: str = "chat"
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
4 changes: 4 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -1,5 +1,6 @@
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_CHAT,
ADAPT_GENERATION_MULTIMODAL,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -13,6 +14,7 @@
from helm.benchmark.adaptation.adapters.adapter import Adapter
from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
from helm.benchmark.adaptation.adapters.chat_adapter import ChatAdapter
from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
@@ -38,6 +40,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)

if method == ADAPT_GENERATION:
adapter = GenerationAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_CHAT:
adapter = ChatAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_LANGUAGE_MODELING:
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
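For illustration, a minimal sketch (not part of this commit) of an `AdapterSpec` that would take the new `ADAPT_CHAT` branch above; the model name and generation parameters are placeholders, not values from this diff.

```python
from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, AdapterSpec

# Illustrative spec only: the model and generation settings below are examples.
chat_spec = AdapterSpec(
    method=ADAPT_CHAT,                         # new adaptation method added by this PR
    model="openai/gpt-4o-2024-05-13",          # placeholder chat model
    model_deployment="openai/gpt-4o-2024-05-13",
    max_tokens=2000,
    num_outputs=1,
    temperature=0.0,
)

# Passed to get_adapter(...) above, this spec now dispatches to ChatAdapter
# instead of falling through to the unknown-method error path.
```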
52 changes: 52 additions & 0 deletions src/helm/benchmark/adaptation/adapters/chat_adapter.py
@@ -0,0 +1,52 @@
from typing import List

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.scenarios.scenario import Instance
from helm.common.request import Request
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter


class ChatAdapter(InContextLearningAdapter):
"""
Each `Instance` in a `Scenario` has a history of the format:
[
{"role": "user", "content": <user-content>},
{"role": "assistant", "content": <assistant-content>},
{"role": "user", "content": <user-content>},
...
]
"""

def generate_requests(
self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
) -> List[RequestState]:
assert eval_instance.extra_data
messages = [
{"role": message["role"], "content": message["content"]}
for message in eval_instance.extra_data["conversation"]
]
request = Request(
model=self.adapter_spec.model,
model_deployment=self.adapter_spec.model_deployment,
messages=messages,
num_completions=self.adapter_spec.num_outputs,
temperature=self.adapter_spec.temperature,
max_tokens=self.adapter_spec.max_tokens,
stop_sequences=self.adapter_spec.stop_sequences,
random=self.adapter_spec.random,
image_generation_parameters=self.adapter_spec.image_generation_parameters,
)
request_state = RequestState(
instance=eval_instance,
reference_index=None,
request_mode=None,
train_trial_index=train_trial_index,
output_mapping=None,
request=request,
result=None,
num_train_instances=0,
prompt_truncated=False,
)
return [request_state]
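For illustration, a minimal sketch (not part of this commit) of the per-instance data `ChatAdapter` consumes. The conversation content is invented; in this PR the WildBench scenario (not shown in these hunks) is what populates `extra_data["conversation"]`.

```python
from helm.benchmark.scenarios.scenario import Input, Instance

# Illustrative instance: ChatAdapter reads extra_data["conversation"], not `input`.
instance = Instance(
    input=Input(text=""),
    references=[],
    extra_data={
        "conversation": [
            {"role": "user", "content": "Write a haiku about autumn."},
            {"role": "assistant", "content": "Crisp leaves drift and fall..."},
            {"role": "user", "content": "Now make it about winter."},
        ]
    },
)
# ChatAdapter.generate_requests(instance, 0, []) turns this history into a single
# Request whose `messages` field carries the role/content pairs verbatim.
```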
75 changes: 75 additions & 0 deletions src/helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md
@@ -0,0 +1,75 @@
# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated responses (Response A and Response B).
You should first read the user query and the conversation history carefully to analyze the task, and then evaluate the quality of the responses based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{$history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{$user_query}

<|end_of_query|>

## Response A
<|begin_of_response_A|>

{$candidate_A}

<|end_of_response_A|>

## Response B
<|begin_of_response_B|>

{$candidate_B}

<|end_of_response_B|>

# Evaluation

## Checklist

<|begin_of_checklist|>

{$checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.

## Rules

You should compare the above two responses based on your analysis of the user query and the conversation history.
You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
There are five choices to give your final assessment: ["A++", "A+", "A=B", "B+", "B++"], which correspond to the following meanings:

- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.


## Output Format
First, please output your analysis for each model response, then summarize your assessment into three aspects: "reason of A=B", "reason of A>B", and "reason of B>A", and finally make your choice for the final assessment.

Please provide your evaluation results in the following json format by filling in the placeholders in []:
```
{
"analysis of A": "[analysis of Response A]",
"analysis of B": "[analysis of Response B]",
"reason of A=B": "[where Response A and B perform equally well]",
"reason of A>B": "[where Response A is better than Response B]",
"reason of B>A": "[where Response B is better than Response A]",
"choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
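No pairwise annotator appears in the hunks shown here, so the following is only a hypothetical helper illustrating how the `{$...}` placeholders in this template could be filled, mirroring the plain string-replacement approach used by the score annotator further below.

```python
from typing import List


def fill_pairwise_template(
    template: str, history: str, user_query: str, response_a: str, response_b: str, checklist: List[str]
) -> str:
    """Hypothetical helper: substitute the pairwise template's placeholders."""
    return (
        template.replace("{$history}", history)
        .replace("{$user_query}", user_query)
        .replace("{$candidate_A}", response_a)
        .replace("{$candidate_B}", response_b)
        .replace("{$checklist}", "\n".join(checklist))
    )
```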
66 changes: 66 additions & 0 deletions src/helm/benchmark/annotation/wildbench/eval_template.score.v2.md
@@ -0,0 +1,66 @@
# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user query and an AI-generated response.
You should first read the user query and the conversation history carefully to analyze the task, and then evaluate the quality of the response based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{$history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{$user_query}

<|end_of_query|>

## AI Response
<|begin_of_response|>

{$model_output}

<|end_of_response|>


# Evaluation

## Checklist

<|begin_of_checklist|>

{$checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.

## Rules

You should evaluate the above response based on your analysis of the user query and the conversation history.
You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
The scores are in the range of 1~10, where 1 means the response is very poor and 10 means the response is perfect.
Here are more detailed criteria for the scores:

- Score 1~2: The response is very poor and does not make sense at all.
- Score 3~4: The response is poor and does not help the user solve the problem in a meaningful way.
- Score 5~6: The response is fair but has some issues (e.g., factual errors, hallucinations, missing key information).
- Score 7~8: The response is good enough but could be improved in some ways.
- Score 9~10: The response is perfect and provides helpful information that can help the user solve the problem.

## Output Format
First, please output your analysis for the model response, then summarize your assessment into two aspects: "strengths" and "weaknesses". Finally, please write down your rating for the assessment.

Please provide your evaluation results in the following json format by filling in the placeholders in []:
```
{
"strengths": "[analysis for the strengths of the response]",
"weaknesses": "[analysis for the weaknesses of the response]",
"score": "[1~10]"
}
```
63 changes: 63 additions & 0 deletions src/helm/benchmark/annotation/wildbench_annotator.py
@@ -0,0 +1,63 @@
import re
from typing import Any

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.clients.auto_client import AutoClient
from helm.common.request import Request


class WildBenchAnnotator(Annotator):
"""The WildBench autograder."""

name = "wildbench"

def __init__(self, auto_client: AutoClient):
self._auto_client = auto_client
with open("src/helm/benchmark/annotation/wildbench/eval_template.score.v2.md") as f:
self._score_template = f.read()
self._pattern = re.compile(
r'"strengths"\s*:\s*"(.*?)"\s*,\s*"weaknesses"\s*:\s*"(.*?)"\s*,\s*"score"\s*:\s*(".*?"|\d+)', re.DOTALL
)

def annotate(self, request_state: RequestState) -> Any:
assert request_state.result
assert len(request_state.result.completions) == 1
assert request_state.instance.extra_data
model_output_text = request_state.result.completions[0].text
if not model_output_text.strip():
# Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
return {"prompt_text": "", "strengths": "N/A", "weaknesses": "The model output is empty.", "score": 1.0}
prompt_template = self._score_template

annotator_prompt = (
prompt_template.replace("{$history}", request_state.instance.extra_data["history"])
.replace("{$user_query}", request_state.instance.extra_data["user_query"])
.replace("{$model_output}", model_output_text)
.replace("{$checklist}", "\n".join(request_state.instance.extra_data["checklist"]))
)
annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
model_deployment="openai/gpt-4o-2024-05-13",
prompt=annotator_prompt,
temperature=0.0,
max_tokens=2000,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
raise Exception(f"Annotation request failed: {annotator_response.error}")
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
annotator_response_parts = self._pattern.search(annotator_response_text)
if not annotator_response_parts:
raise ValueError(f"Malformed annotator response: {annotator_response_text}")

strengths = annotator_response_parts[1].strip()
weaknesses = annotator_response_parts[2].strip()
score_text = annotator_response_parts[3].strip().strip('"')
try:
score = float(score_text)
except ValueError:
raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")

return {"prompt_text": annotator_prompt, "strengths": strengths, "weaknesses": weaknesses, "score": score}
9 changes: 8 additions & 1 deletion src/helm/benchmark/metrics/ifeval_metrics.py
@@ -1,5 +1,6 @@
from typing import List

from helm.common.hierarchical_logger import hlog
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
@@ -40,7 +41,13 @@ def evaluate_generation(
if args and "prompt" in args:
instruction.build_description(prompt=prompt)

if response.strip() and instruction.check_following(response):
is_following = False
if response.strip():
try:
is_following = instruction.check_following(response)
except Exception as e:
hlog(f"WARNING: Instruction following checking failed with error message {e}")
if is_following:
is_following_list.append(1)
else:
is_following_list.append(0)
25 changes: 25 additions & 0 deletions src/helm/benchmark/metrics/wildbench_metrics.py
@@ -0,0 +1,25 @@
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class WildBenchScoreMetric(Metric):
"""Score metrics for WildBench."""

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
assert request_state.annotations
score = request_state.annotations["wildbench"]["score"]
return [
Stat(MetricName("wildbench_score")).add(score),
]
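The metric reads the dictionary returned by `WildBenchAnnotator.annotate`, which, as the lookup above implies, is stored under the annotator's name. A minimal sketch with invented values:

```python
# Illustrative only: the annotation dict is keyed by the annotator's name ("wildbench").
annotations = {
    "wildbench": {"prompt_text": "...", "strengths": "...", "weaknesses": "...", "score": 7.0}
}
score = annotations["wildbench"]["score"]  # 7.0, emitted as Stat(MetricName("wildbench_score"))
```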
29 changes: 29 additions & 0 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -4,6 +4,7 @@

from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_CHAT,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
AdapterSpec,
@@ -26,6 +27,7 @@
from helm.benchmark.runner import get_benchmark_output_path
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.annotation.annotator import AnnotatorSpec


@run_spec_function("narrative_qa")
@@ -449,3 +451,30 @@ def get_ifeval_spec() -> RunSpec:
metric_specs=metric_specs,
groups=["ifeval"],
)


@run_spec_function("wildbench")
def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario",
args={
"subset": subset,
"use_model_outputs": use_model_outputs == "True",
},
)

adapter_spec = AdapterSpec(
method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
)
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.wildbench_annotator.WildBenchAnnotator")]
metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.wildbench_metrics.WildBenchScoreMetric")]

return RunSpec(
name="wildbench",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["wildbench"],
)
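A hedged usage sketch (not part of this commit): calling the run spec function directly to inspect the resulting `RunSpec`. The subset value is a placeholder; the accepted subsets are defined by the WildBench scenario, which is not shown in these hunks.

```python
# Assumes the run_spec_function decorator leaves get_wildbench_spec directly callable;
# in normal use, helm-run resolves a run entry such as "wildbench:subset=..." to it.
spec = get_wildbench_spec(subset="v2", use_model_outputs="False")
print(spec.name)                 # wildbench
print(spec.adapter_spec.method)  # chat
print([m.class_name for m in spec.metric_specs])
```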