
Adding ENEM Challenge Scenario & Maritaca AI model (Sabiá 7B) #3185

Merged
30 changes: 30 additions & 0 deletions src/helm/benchmark/run_specs/enem_challenge_specs.py
@@ -0,0 +1,30 @@
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec

@run_spec_function("enem_challenge")
def get_enem_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={}
    )

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
        "Se as opções forem A, B, C, D e E, "
        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
        "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n"
        "Resposta: B",
        input_noun="Pergunta",
        output_noun="Resposta",
    )

    return RunSpec(
        name="enem_challenge",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["enem_challenge"],
    )
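
As a quick sanity check of the new run spec, the registered function can be called directly outside of a full helm-run invocation. This is only an illustrative sketch; it assumes nothing beyond the code added in this file and the standard RunSpec/AdapterSpec fields.

from helm.benchmark.run_specs.enem_challenge_specs import get_enem_spec

# Build the RunSpec exactly as the "enem_challenge" run entry would, then
# inspect a few of its fields.
spec = get_enem_spec()
print(spec.name)                      # enem_challenge
print(spec.scenario_spec.class_name)  # helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario
print(spec.adapter_spec.method)       # multiple_choice_joint
print(spec.groups)                    # ['enem_challenge']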
62 changes: 62 additions & 0 deletions src/helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -0,0 +1,62 @@
import re
from typing import List, Any
from pathlib import Path
from datasets import load_dataset

from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    CORRECT_TAG,
    TEST_SPLIT,
    Input,
    Output,
)


class ENEMChallengeScenario(Scenario):
    """
    The Exame Nacional do Ensino Médio (ENEM) is a high-school-level exam administered every year
    by the Brazilian government to students who wish to pursue a university degree.

    The questions span a wide range of intellectual fields and are divided into four groups:
    Humanities, Languages, Sciences, and Mathematics.

    This scenario is based on the exams applied from 2009 to 2023.

    The dataset can be found at: https://huggingface.co/datasets/eduagarcia/enem_challenge
    """

    name = "enem_challenge"
    description = "ENEM Challenge dataset"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    def __init__(self):
        super().__init__()
Collaborator:
nit: delete empty constructor

    def get_instances(self, output_path: str) -> List[Instance]:
        # Download the raw data and read all the exam questions
        dataset: Any
        # Read all the instances
        instances: List[Instance] = []
        cache_dir = str(Path(output_path) / "data")

        dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir)
        for example in dataset["train"]:
            question = example["question"]
            choices = example["choices"]
            answer = example["answerKey"]
            # Skip canceled (ANULADO) questions
            if answer == "ANULADO":
                continue
            answers_dict = dict(zip(choices["label"], choices["text"]))
            correct_answer = answers_dict[answer]

            def answer_to_reference(answer: str) -> Reference:
                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])

            instance = Instance(
                input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
            )
            instances.append(instance)
        return instances
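
To make the reference construction above concrete, here is a small sketch over a hypothetical row that mirrors the eduagarcia/enem_challenge schema read by get_instances (the row itself is made up; the question is borrowed from the adapter instructions).

# Hypothetical example row with the same fields that get_instances reads:
example = {
    "question": "Qual é a capital da França?",
    "choices": {
        "label": ["A", "B", "C", "D", "E"],
        "text": ["Londres", "Paris", "Roma", "Berlim", "Sydney"],
    },
    "answerKey": "B",
}

# Map answer letters to answer texts and pick the correct one, as above.
answers_dict = dict(zip(example["choices"]["label"], example["choices"]["text"]))
correct_answer = answers_dict[example["answerKey"]]  # "Paris"

# Only the reference whose text equals correct_answer receives CORRECT_TAG;
# rows whose answerKey is "ANULADO" are skipped entirely.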
26 changes: 26 additions & 0 deletions src/helm/benchmark/scenarios/test_enem_challenge_scenario.py
@@ -0,0 +1,26 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario
from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference


@pytest.mark.scenarios
def test_enem_challenge_scenario():
    enem_scenario = ENEMChallengeScenario(subset="test")
Collaborator:
Test failing with:

src/helm/benchmark/scenarios/test_enem_challenge_scenario.py:10: error: Unexpected keyword argument "subset" for "ENEMChallengeScenario"  [call-arg]

I think you just need to delete subset="test".

    with TemporaryDirectory() as tmpdir:
        instances = enem_scenario.get_instances(tmpdir)
        assert len(instances) == 1431
        assert instances[0].split == TEST_SPLIT

        assert instances[0].input.text.startswith("A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)")
        assert len(instances[0].input.text) == 1163

        assert instances[0].references == [
            Reference(output=Output(text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. "), tags=[]),
            Reference(output=Output(text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. "), tags=[]),
            Reference(output=Output(text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. "), tags=[CORRECT_TAG]),
            Reference(output=Output(text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. "), tags=[]),
            Reference(output=Output(text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. "), tags=[]),
        ]
        assert instances[0].references[2].is_correct
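
Usage note: since the test is marked with @pytest.mark.scenarios, it can presumably be run on its own with something along the lines of pytest -m scenarios src/helm/benchmark/scenarios/test_enem_challenge_scenario.py, assuming the scenarios marker is registered in the repository's pytest configuration; it downloads the dataset into a temporary directory, so it needs network access.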
146 changes: 146 additions & 0 deletions src/helm/benchmark/static/schema_enem_challenge.yaml
@@ -0,0 +1,146 @@
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false


############################################################
perturbations: []

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  # - name: efficiency
  #   display_name: Efficiency
  #   metrics:
  #     - name: inference_runtime
  #       split: ${main_split}

  - name: general_information
    display_name: General information
    hide_win_rates: true
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}

############################################################
run_groups:
  - name: core_scenarios
    display_name: Core Scenarios
    description: Core Scenarios
    category: All scenarios
    subgroups:
      - enem_challenge

  - name: enem_challenge
    display_name: ENEM Challenge
    description: Multiple-choice questions from the Brazilian national high-school exam (ENEM), covering the 2009-2023 exams.
    metric_groups:
      - accuracy
      # - efficiency
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: "multiple-choice question answering"
      what: "general academic subjects"
      who: "Brazilian Ministry of Education"
      when: "between 2009 and 2023"
      language: Portuguese
9 changes: 9 additions & 0 deletions src/helm/config/model_deployments.yaml
@@ -2840,3 +2840,12 @@ model_deployments:
      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
      args:
        pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base

  - name: huggingface/sabia-7b
    model_name: maritaca-ai/sabia-7b
    tokenizer_name: maritaca-ai/sabia-7b
    max_sequence_length: 2048
    client_spec:
      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
      args:
        pretrained_model_name_or_path: maritaca-ai/sabia-7b
11 changes: 10 additions & 1 deletion src/helm/config/model_metadata.yaml
@@ -3459,4 +3459,13 @@ models:
    access: open
    num_parameters: 1380000000
    release: 2024-10-21
    tags: [TEXT_MODEL_TAG]
    tags: [TEXT_MODEL_TAG]

  - name: maritaca-ai/sabia-7b
    display_name: Sabiá 7B
    description: Sabiá 7B, a Portuguese language model from Maritaca AI.
    creator_organization_name: Maritaca AI
    access: open
    num_parameters: 6740000000
    release_date: 2023-11-08
    tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
10 changes: 9 additions & 1 deletion src/helm/config/tokenizer_configs.yaml
@@ -728,4 +728,12 @@ tokenizer_configs:
      args:
        pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
    end_of_text_token: ""
    prefix_token: ""
    prefix_token: ""

  - name: maritaca-ai/sabia-7b
    tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
      args:
        pretrained_model_name_or_path: maritaca-ai/sabia-7b
    end_of_text_token: "</s>"
    prefix_token: "<s>"
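
As a sketch (not part of the PR), the special tokens declared above can be cross-checked against the Hugging Face tokenizer itself; this assumes the maritaca-ai/sabia-7b tokenizer is publicly downloadable.

from transformers import AutoTokenizer

# Download the tokenizer and verify that it exposes the same special tokens
# that tokenizer_configs.yaml declares for maritaca-ai/sabia-7b.
tokenizer = AutoTokenizer.from_pretrained("maritaca-ai/sabia-7b")
assert tokenizer.bos_token == "<s>"   # matches prefix_token
assert tokenizer.eos_token == "</s>"  # matches end_of_text_token
print(tokenizer("Pergunta: Qual é a capital da França?")["input_ids"][:8])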