Adding WildBench (#3150)
liamjxu authored Dec 13, 2024
1 parent 40b3d23 commit 1e55710
Showing 14 changed files with 521 additions and 25 deletions.
1 change: 1 addition & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -6,6 +6,7 @@

# Adaptation methods
ADAPT_GENERATION: str = "generation"
ADAPT_CHAT: str = "chat"
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
4 changes: 4 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -1,5 +1,6 @@
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_CHAT,
ADAPT_GENERATION_MULTIMODAL,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -13,6 +14,7 @@
from helm.benchmark.adaptation.adapters.adapter import Adapter
from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
from helm.benchmark.adaptation.adapters.chat_adapter import ChatAdapter
from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
@@ -38,6 +40,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)

if method == ADAPT_GENERATION:
adapter = GenerationAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_CHAT:
adapter = ChatAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_LANGUAGE_MODELING:
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
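For illustration, a minimal sketch (not part of this commit) of an `AdapterSpec` that would take the new `ADAPT_CHAT` branch above; the model name and generation parameters are placeholders, not values from this diff.

```python
from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, AdapterSpec

# Illustrative spec only: the model and generation settings below are examples.
chat_spec = AdapterSpec(
    method=ADAPT_CHAT,                         # new adaptation method added by this PR
    model="openai/gpt-4o-2024-05-13",          # placeholder chat model
    model_deployment="openai/gpt-4o-2024-05-13",
    max_tokens=2000,
    num_outputs=1,
    temperature=0.0,
)

# Passed to get_adapter(...) above, this spec now dispatches to ChatAdapter
# instead of falling through to the unknown-method error path.
```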
52 changes: 52 additions & 0 deletions src/helm/benchmark/adaptation/adapters/chat_adapter.py
@@ -0,0 +1,52 @@
from typing import List

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.scenarios.scenario import Instance
from helm.common.request import Request
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter


class ChatAdapter(InContextLearningAdapter):
"""
Each `Instance` in a `Scenario` has a history of the format:
[
{"role": "user", "content": <user-content>},
{"role": "assistant", "content": <assistant-content>},
{"role": "user", "content": <user-content>},
...
]
"""

def generate_requests(
self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
) -> List[RequestState]:
assert eval_instance.extra_data
messages = [
{"role": message["role"], "content": message["content"]}
for message in eval_instance.extra_data["conversation"]
]
request = Request(
model=self.adapter_spec.model,
model_deployment=self.adapter_spec.model_deployment,
messages=messages,
num_completions=self.adapter_spec.num_outputs,
temperature=self.adapter_spec.temperature,
max_tokens=self.adapter_spec.max_tokens,
stop_sequences=self.adapter_spec.stop_sequences,
random=self.adapter_spec.random,
image_generation_parameters=self.adapter_spec.image_generation_parameters,
)
request_state = RequestState(
instance=eval_instance,
reference_index=None,
request_mode=None,
train_trial_index=train_trial_index,
output_mapping=None,
request=request,
result=None,
num_train_instances=0,
prompt_truncated=False,
)
return [request_state]
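For illustration, a minimal sketch (not part of this commit) of the per-instance data `ChatAdapter` consumes. The conversation content is invented; in this PR the WildBench scenario (not shown in these hunks) is what populates `extra_data["conversation"]`.

```python
from helm.benchmark.scenarios.scenario import Input, Instance

# Illustrative instance: ChatAdapter reads extra_data["conversation"], not `input`.
instance = Instance(
    input=Input(text=""),
    references=[],
    extra_data={
        "conversation": [
            {"role": "user", "content": "Write a haiku about autumn."},
            {"role": "assistant", "content": "Crisp leaves drift and fall..."},
            {"role": "user", "content": "Now make it about winter."},
        ]
    },
)
# ChatAdapter.generate_requests(instance, 0, []) turns this history into a single
# Request whose `messages` field carries the role/content pairs verbatim.
```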
75 changes: 75 additions & 0 deletions src/helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md
@@ -0,0 +1,75 @@
# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated responses (Response A and Response B).
You should first read the user query and the conversation history carefully to analyze the task, and then evaluate the quality of the responses based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{$history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{$user_query}

<|end_of_query|>

## Response A
<|begin_of_response_A|>

{$candidate_A}

<|end_of_response_A|>

## Response B
<|begin_of_response_B|>

{$candidate_B}

<|end_of_response_B|>

# Evaluation

## Checklist

<|begin_of_checklist|>

{$checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.

## Rules

You should compare the above two responses based on your analysis of the user query and the conversation history.
You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
There are five choices to give your final assessment: ["A++", "A+", "A=B", "B+", "B++"], which correspond to the following meanings:

- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.


## Output Format
First, please output your analysis for each model response, then summarize your assessment into three aspects: "reason of A=B", "reason of A>B", and "reason of B>A", and finally make your choice for the final assessment.

Please provide your evaluation results in the following json format by filling in the placeholders in []:
```
{
"analysis of A": "[analysis of Response A]",
"analysis of B": "[analysis of Response B]",
"reason of A=B": "[where Response A and B perform equally well]",
"reason of A>B": "[where Response A is better than Response B]",
"reason of B>A": "[where Response B is better than Response A]",
"choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
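No pairwise annotator appears in the hunks shown here, so the following is only a hypothetical helper illustrating how the `{$...}` placeholders in this template could be filled, mirroring the plain string-replacement approach used by the score annotator further below.

```python
from typing import List


def fill_pairwise_template(
    template: str, history: str, user_query: str, response_a: str, response_b: str, checklist: List[str]
) -> str:
    """Hypothetical helper: substitute the pairwise template's placeholders."""
    return (
        template.replace("{$history}", history)
        .replace("{$user_query}", user_query)
        .replace("{$candidate_A}", response_a)
        .replace("{$candidate_B}", response_b)
        .replace("{$checklist}", "\n".join(checklist))
    )
```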
66 changes: 66 additions & 0 deletions src/helm/benchmark/annotation/wildbench/eval_template.score.v2.md
@@ -0,0 +1,66 @@
# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user query and an AI-generated response.
You should first read the user query and the conversation history carefully to analyze the task, and then evaluate the quality of the response based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{$history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{$user_query}

<|end_of_query|>

## AI Response
<|begin_of_response|>

{$model_output}

<|end_of_response|>


# Evaluation

## Checklist

<|begin_of_checklist|>

{$checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.

## Rules

You should evaluate the above response based on your analysis of the user query and the conversation history.
You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
The scores are in the range of 1~10, where 1 means the response is very poor and 10 means the response is perfect.
Here are more detailed criteria for the scores:

- Score 1~2: The response is very poor and does not make sense at all.
- Score 3~4: The response is poor and does not help the user solve the problem in a meaningful way.
- Score 5~6: The response is fair but has some issues (e.g., factual errors, hallucinations, missing key information).
- Score 7~8: The response is good enough but could be improved in some ways.
- Score 9~10: The response is perfect and provides helpful information that can help the user solve the problem.

## Output Format
First, please output your analysis for the model response, then summarize your assessment into two aspects: "strengths" and "weaknesses". Finally, please write down your rating for the assessment.

Please provide your evaluation results in the following json format by filling in the placeholders in []:
```
{
"strengths": "[analysis for the strengths of the response]",
"weaknesses": "[analysis for the weaknesses of the response]",
"score": "[1~10]"
}
```
63 changes: 63 additions & 0 deletions src/helm/benchmark/annotation/wildbench_annotator.py
@@ -0,0 +1,63 @@
import re
from typing import Any

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.clients.auto_client import AutoClient
from helm.common.request import Request


class WildBenchAnnotator(Annotator):
"""The WildBench autograder."""

name = "wildbench"

def __init__(self, auto_client: AutoClient):
self._auto_client = auto_client
with open("src/helm/benchmark/annotation/wildbench/eval_template.score.v2.md") as f:
self._score_template = f.read()
self._pattern = re.compile(
r'"strengths"\s*:\s*"(.*?)"\s*,\s*"weaknesses"\s*:\s*"(.*?)"\s*,\s*"score"\s*:\s*(".*?"|\d+)', re.DOTALL
)

def annotate(self, request_state: RequestState) -> Any:
assert request_state.result
assert len(request_state.result.completions) == 1
assert request_state.instance.extra_data
model_output_text = request_state.result.completions[0].text
if not model_output_text.strip():
# Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
return {"prompt_text": "", "strengths": "N/A", "weaknesses": "The model output is empty.", "score": 1.0}
prompt_template = self._score_template

annotator_prompt = (
prompt_template.replace("{$history}", request_state.instance.extra_data["history"])
.replace("{$user_query}", request_state.instance.extra_data["user_query"])
.replace("{$model_output}", model_output_text)
.replace("{$checklist}", "\n".join(request_state.instance.extra_data["checklist"]))
)
annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
model_deployment="openai/gpt-4o-2024-05-13",
prompt=annotator_prompt,
temperature=0.0,
max_tokens=2000,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
raise Exception(f"Annotation request failed: {annotator_response.error}")
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
annotator_response_parts = self._pattern.search(annotator_response_text)
if not annotator_response_parts:
raise ValueError(f"Malformed annotator response: {annotator_response_text}")

strengths = annotator_response_parts[1].strip()
weaknesses = annotator_response_parts[2].strip()
score_text = annotator_response_parts[3].strip().strip('"')
try:
score = float(score_text)
except ValueError:
raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")

return {"prompt_text": annotator_prompt, "strengths": strengths, "weaknesses": weaknesses, "score": score}
9 changes: 8 additions & 1 deletion src/helm/benchmark/metrics/ifeval_metrics.py
@@ -1,5 +1,6 @@
from typing import List

from helm.common.hierarchical_logger import hlog
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
@@ -40,7 +41,13 @@ def evaluate_generation(
if args and "prompt" in args:
instruction.build_description(prompt=prompt)

if response.strip() and instruction.check_following(response):
is_following = False
if response.strip():
try:
is_following = instruction.check_following(response)
except Exception as e:
hlog(f"WARNING: Instruction following checking failed with error message {e}")
if is_following:
is_following_list.append(1)
else:
is_following_list.append(0)
25 changes: 25 additions & 0 deletions src/helm/benchmark/metrics/wildbench_metrics.py
@@ -0,0 +1,25 @@
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class WildBenchScoreMetric(Metric):
"""Score metrics for WildBench."""

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
assert request_state.annotations
score = request_state.annotations["wildbench"]["score"]
return [
Stat(MetricName("wildbench_score")).add(score),
]
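The metric reads the dictionary returned by `WildBenchAnnotator.annotate`, which, as the lookup above implies, is stored under the annotator's name. A minimal sketch with invented values:

```python
# Illustrative only: the annotation dict is keyed by the annotator's name ("wildbench").
annotations = {
    "wildbench": {"prompt_text": "...", "strengths": "...", "weaknesses": "...", "score": 7.0}
}
score = annotations["wildbench"]["score"]  # 7.0, emitted as Stat(MetricName("wildbench_score"))
```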
29 changes: 29 additions & 0 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -4,6 +4,7 @@

from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_CHAT,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
AdapterSpec,
@@ -26,6 +27,7 @@
from helm.benchmark.runner import get_benchmark_output_path
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.annotation.annotator import AnnotatorSpec


@run_spec_function("narrative_qa")
@@ -449,3 +451,30 @@ def get_ifeval_spec() -> RunSpec:
metric_specs=metric_specs,
groups=["ifeval"],
)


@run_spec_function("wildbench")
def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario",
args={
"subset": subset,
"use_model_outputs": use_model_outputs == "True",
},
)

adapter_spec = AdapterSpec(
method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
)
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.wildbench_annotator.WildBenchAnnotator")]
metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.wildbench_metrics.WildBenchScoreMetric")]

return RunSpec(
name="wildbench",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["wildbench"],
)
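A hedged usage sketch (not part of this commit): calling the run spec function directly to inspect the resulting `RunSpec`. The subset value is a placeholder; the accepted subsets are defined by the WildBench scenario, which is not shown in these hunks.

```python
# Assumes the run_spec_function decorator leaves get_wildbench_spec directly callable;
# in normal use, helm-run resolves a run entry such as "wildbench:subset=..." to it.
spec = get_wildbench_spec(subset="v2", use_model_outputs="False")
print(spec.name)                 # wildbench
print(spec.adapter_spec.method)  # chat
print([m.class_name for m in spec.metric_specs])
```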