Adding TweetSentBR Scenario #3219

Merged
32 changes: 32 additions & 0 deletions src/helm/benchmark/run_specs/tweetsentbr_run_specs.py
@@ -0,0 +1,32 @@
from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec


@run_spec_function("tweetsentbr")
def get_tweetsentbr_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario", args={}
    )

    adapter_spec = get_generation_adapter_spec(
        instructions="""Classifique o tweet como "Positivo", "Neutro" ou "Negativo".

Tweet: vocês viram a novela hoje?
Classe: Neutro

Tweet: que vontade de comer pizza
Classe: Neutro
""",
        input_noun="Tweet",
        output_noun="Classe",
    )

    return RunSpec(
        name="tweetsentbr",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["tweetsentbr"],
    )
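For a quick local check of the new run spec, the registered factory can be called directly. A minimal sketch, assuming an editable HELM install so the new module is importable; the expected values follow from the diff above:

from helm.benchmark.run_specs.tweetsentbr_run_specs import get_tweetsentbr_spec

spec = get_tweetsentbr_spec()
print(spec.name)                      # "tweetsentbr"
print(spec.scenario_spec.class_name)  # "helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario"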
24 changes: 24 additions & 0 deletions src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py
@@ -0,0 +1,24 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference


@pytest.mark.scenarios
def test_tweetsentbr_scenario():
    tweetsentbr = TweetSentBRScenario()
    with TemporaryDirectory() as tmpdir:
        instances = tweetsentbr.get_instances(tmpdir)
        assert len(instances) == 2085
        assert instances[0].split == TRAIN_SPLIT

        assert instances[0].input.text.startswith("joca tá com a corda toda 😂 😂 😂 😂")
        assert len(instances[0].input.text) == 32

        assert instances[0].references == [
            Reference(
                output=Output(text="Positivo"),
                tags=[CORRECT_TAG],
            )
        ]
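The test is gated behind the scenarios pytest marker, so it may not run in a default test invocation. A sketch of exercising it directly from a Python session; note it downloads the dataset from the Hugging Face Hub on first run:

from helm.benchmark.scenarios.test_tweetsentbr_scenario import test_tweetsentbr_scenario

test_tweetsentbr_scenario()  # passes silently if all assertions hold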
66 changes: 66 additions & 0 deletions src/helm/benchmark/scenarios/tweetsentbr_scenario.py
@@ -0,0 +1,66 @@
from typing import Any, List, Dict
from pathlib import Path
from datasets import load_dataset
from helm.common.hierarchical_logger import hlog
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    TEST_SPLIT,
    CORRECT_TAG,
    Input,
    Output,
)


class TweetSentBRScenario(Scenario):
    """
    TweetSentBR is a corpus of tweets in Brazilian Portuguese. It was labeled by several
    annotators following steps established in the literature for improving reliability in
    sentiment analysis. Each tweet was annotated with one of the following three classes:

    Positive - tweets where a user expressed a positive reaction or evaluation of the main topic of the post;
    Negative - tweets where a user expressed a negative reaction or evaluation of the main topic of the post;
    Neutral - tweets not belonging to either of the previous classes, usually not making a point, off-topic,
    irrelevant, confusing, or containing only objective data.

    This dataset is a subset of TweetSentBR: it contains only 75 samples from the training set
    and all 2,000+ instances of the test set. It is meant for evaluating language models in a few-shot setting.
    """

name = "simple_classification"
description = "Classify tweets into Positive, Negative or Neutral."
tags = ["classification"]

    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
        instances: List[Instance] = []
        label_names = {"Positive": "Positivo", "Negative": "Negativo", "Neutral": "Neutro"}
        for example in dataset[split]:
            input = Input(text=example["sentence"])
            # NOTE: For classification scenarios, the reference outputs should be the same
            # for all instances, and should include both correct and incorrect classes.
            # HELM only supports single-label classification. Exactly one reference
            # should have the CORRECT_TAG tag.
            references = [
                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
            ]
            instance = Instance(input=input, references=references, split=split)
            instances.append(instance)
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        instances: List[Instance] = []
        cache_dir = str(Path(output_path) / "data")
        dataset = load_dataset("eduagarcia/tweetsentbr_fewshot", cache_dir=cache_dir)
        splits: Dict[str, str] = {
            "train": TRAIN_SPLIT,
            "test": TEST_SPLIT,
        }
        for split in splits:
            # Skip any split that is missing from the downloaded dataset.
            if split not in dataset:
                hlog(f"{split} split doesn't exist, skipping")
                continue
            instances.extend(self.process_dataset(dataset, splits[split]))

        return instances
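To eyeball the split sizes outside the HELM runner, the scenario can also be driven standalone. A sketch; the expected counts follow the class docstring above:

from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT
from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario

with TemporaryDirectory() as tmpdir:
    instances = TweetSentBRScenario().get_instances(tmpdir)

print(sum(1 for inst in instances if inst.split == TRAIN_SPLIT))  # expected: 75
print(sum(1 for inst in instances if inst.split == TEST_SPLIT))   # expected: 2,000+ per the docstring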
146 changes: 146 additions & 0 deletions src/helm/benchmark/static/schema_tweetsentbr.yaml
@@ -0,0 +1,146 @@
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances where the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances where the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances where the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances where the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false


############################################################
perturbations: []

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  - name: efficiency
    display_name: Efficiency
    metrics:
      - name: inference_runtime
        split: ${main_split}

  - name: general_information
    display_name: General information
    hide_win_rates: true
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}

############################################################
run_groups:
  - name: core_scenarios
    display_name: Core Scenarios
    description: Core scenarios for this evaluation.
    category: All scenarios
    subgroups:
      - tweetsentbr

  - name: tweetsentbr
    display_name: TweetSentBR
    description: Sentiment classification of Brazilian Portuguese tweets into Positive, Neutral or Negative.
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: "text classification"
      what: "tweets with sentiments"
      who: "?"
      when: "2018"
      language: Portuguese
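Since indentation is significant here, a quick parse check catches layout mistakes before HELM consumes the file. A sketch, assuming PyYAML is installed:

import yaml

with open("src/helm/benchmark/static/schema_tweetsentbr.yaml") as f:
    schema = yaml.safe_load(f)

# Top-level sections defined in this file.
assert {"metrics", "perturbations", "metric_groups", "run_groups"} <= schema.keys()
print([group["name"] for group in schema["run_groups"]])  # ['core_scenarios', 'tweetsentbr']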