Adding TweetSentBR Scenario #3219

Merged
32 changes: 32 additions & 0 deletions src/helm/benchmark/run_specs/tweetsentbr_run_specs.py
@@ -0,0 +1,32 @@
from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec


@run_spec_function("tweetsentbr")
def get_tweetsentbr_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario", args={}
    )

    adapter_spec = get_generation_adapter_spec(
        instructions="""Classifique o tweet como "Positivo", "Neutro" ou "Negativo".

Tweet: vocês viram a novela hoje?
Classe: Neutro

Tweet: que vontade de comer pizza
Classe: Neutro
""",
        input_noun="Tweet",
        output_noun="Classe",
    )

    return RunSpec(
        name="tweetsentbr",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["tweetsentbr"],
    )
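For a quick local check of the new run spec, the registered factory can be called directly. A minimal sketch, assuming an editable HELM install so the new module is importable; the expected values follow from the diff above:

from helm.benchmark.run_specs.tweetsentbr_run_specs import get_tweetsentbr_spec

spec = get_tweetsentbr_spec()
print(spec.name)                      # "tweetsentbr"
print(spec.scenario_spec.class_name)  # "helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario"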
24 changes: 24 additions & 0 deletions src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py
@@ -0,0 +1,24 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference


@pytest.mark.scenarios
def test_tweetsentbr_scenario():
    tweetsentbr = TweetSentBRScenario()
    with TemporaryDirectory() as tmpdir:
        instances = tweetsentbr.get_instances(tmpdir)
        assert len(instances) == 2085
        assert instances[0].split == TRAIN_SPLIT

        assert instances[0].input.text.startswith("joca tá com a corda toda 😂 😂 😂 😂")
        assert len(instances[0].input.text) == 32

        assert instances[0].references == [
            Reference(
                output=Output(text="Positivo"),
                tags=[CORRECT_TAG],
            )
        ]
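The test is gated behind the scenarios pytest marker, so it may not run in a default test invocation. A sketch of exercising it directly from a Python session; note it downloads the dataset from the Hugging Face Hub on first run:

from helm.benchmark.scenarios.test_tweetsentbr_scenario import test_tweetsentbr_scenario

test_tweetsentbr_scenario()  # passes silently if all assertions hold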
66 changes: 66 additions & 0 deletions src/helm/benchmark/scenarios/tweetsentbr_scenario.py
@@ -0,0 +1,66 @@
from typing import Any, List, Dict
from pathlib import Path
from datasets import load_dataset
from helm.common.hierarchical_logger import hlog
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    TEST_SPLIT,
    CORRECT_TAG,
    Input,
    Output,
)


class TweetSentBRScenario(Scenario):
    """
    TweetSentBR is a corpus of tweets in Brazilian Portuguese. It was labeled by several
    annotators following steps established in the literature for improving reliability in
    sentiment analysis. Each tweet was annotated with one of the following three classes:

    Positive - tweets where a user expressed a positive reaction or evaluation of the main topic of the post;
    Negative - tweets where a user expressed a negative reaction or evaluation of the main topic of the post;
    Neutral - tweets not belonging to either of the previous classes, usually not making a point, off-topic,
    irrelevant, confusing, or containing only objective data.

    This dataset is a subset of TweetSentBR: it contains only 75 samples from the training set
    and all 2,000+ instances of the test set. It is meant for evaluating language models in a few-shot setting.
    """

name = "simple_classification"
description = "Classify tweets into Positive, Negative or Neutral."
tags = ["classification"]

    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
        instances: List[Instance] = []
        label_names = {"Positive": "Positivo", "Negative": "Negativo", "Neutral": "Neutro"}
        for example in dataset[split]:
            input = Input(text=example["sentence"])
            # NOTE: For classification scenarios, the reference outputs should be the same
            # for all instances, and should include both correct and incorrect classes.
            # HELM only supports single-label classification. Exactly one reference
            # should have the CORRECT_TAG tag.
            references = [
                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
            ]
            instance = Instance(input=input, references=references, split=split)
            instances.append(instance)
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        instances: List[Instance] = []
        cache_dir = str(Path(output_path) / "data")
        dataset = load_dataset("eduagarcia/tweetsentbr_fewshot", cache_dir=cache_dir)
        splits: Dict[str, str] = {
            "train": TRAIN_SPLIT,
            "test": TEST_SPLIT,
        }
        for split in splits:
            # Skip any split that is missing from the downloaded dataset.
            if split not in dataset:
                hlog(f"{split} split doesn't exist, skipping")
                continue
            instances.extend(self.process_dataset(dataset, splits[split]))

        return instances
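To eyeball the split sizes outside the HELM runner, the scenario can also be driven standalone. A sketch; the expected counts follow the class docstring above:

from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT
from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario

with TemporaryDirectory() as tmpdir:
    instances = TweetSentBRScenario().get_instances(tmpdir)

print(sum(1 for inst in instances if inst.split == TRAIN_SPLIT))  # expected: 75
print(sum(1 for inst in instances if inst.split == TEST_SPLIT))   # expected: 2,000+ per the docstring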
146 changes: 146 additions & 0 deletions src/helm/benchmark/static/schema_tweetsentbr.yaml
@@ -0,0 +1,146 @@
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances where the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances where the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances where the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances where the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false


############################################################
perturbations: []

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  - name: efficiency
    display_name: Efficiency
    metrics:
      - name: inference_runtime
        split: ${main_split}

  - name: general_information
    display_name: General information
    hide_win_rates: true
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}

############################################################
run_groups:
  - name: core_scenarios
    display_name: Core Scenarios
    description: Core scenarios for this evaluation.
    category: All scenarios
    subgroups:
      - tweetsentbr

  - name: tweetsentbr
    display_name: TweetSentBR
    description: Sentiment classification of Brazilian Portuguese tweets into Positive, Neutral or Negative.
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: "text classification"
      what: "tweets with sentiments"
      who: "?"
      when: "2018"
      language: Portuguese
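Since indentation is significant here, a quick parse check catches layout mistakes before HELM consumes the file. A sketch, assuming PyYAML is installed:

import yaml

with open("src/helm/benchmark/static/schema_tweetsentbr.yaml") as f:
    schema = yaml.safe_load(f)

# Top-level sections defined in this file.
assert {"metrics", "perturbations", "metric_groups", "run_groups"} <= schema.keys()
print([group["name"] for group in schema["run_groups"]])  # ['core_scenarios', 'tweetsentbr']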