Adding ENEM Challenge Scenario & Maritaca AI model (Sabiá 7B) (#3185)
Co-authored-by: Yifan Mai <[email protected]>
thallysonjsa and yifanmai authored Dec 7, 2024
1 parent ff9c7c9 commit b8a140f
Showing 7 changed files with 315 additions and 0 deletions.
31 changes: 31 additions & 0 deletions src/helm/benchmark/run_specs/enem_challenge_specs.py
@@ -0,0 +1,31 @@
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec


@run_spec_function("enem_challenge")
def get_enem_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={}
    )

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
        "Se as opções forem A, B, C, D e E, "
        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
        "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n"
        "Resposta: B",
        input_noun="Pergunta",
        output_noun="Resposta",
    )

    return RunSpec(
        name="enem_challenge",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["enem_challenge"],
    )
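
For orientation, ADAPT_MULTIPLE_CHOICE_JOINT presents the question and every lettered option in a single prompt, using the configured input_noun and output_noun. A minimal sketch of the resulting layout (illustrative only; the real formatting happens inside HELM's adapter, and the question here is invented):

# Sketch of the joint multiple-choice prompt layout (not HELM adapter code).
question = "Qual é a capital da França?"  # invented example
choices = ["Londres", "Paris", "Roma", "Berlim", "Sydney"]

prompt = f"Pergunta: {question}\n"  # input_noun="Pergunta"
for letter, choice in zip("ABCDE", choices):
    prompt += f"{letter}. {choice}\n"
prompt += "Resposta:"  # output_noun="Resposta"; the model is expected to reply with a single letter
print(prompt)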
58 changes: 58 additions & 0 deletions src/helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -0,0 +1,58 @@
from typing import List, Any
from pathlib import Path
from datasets import load_dataset

from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    CORRECT_TAG,
    TEST_SPLIT,
    Input,
    Output,
)


class ENEMChallengeScenario(Scenario):
    """
    The Exame Nacional do Ensino Médio (ENEM) is an advanced high-school-level exam administered
    every year by the Brazilian government to students who wish to pursue a university degree.
    The questions cover a broad range of intellectual fields and are divided into four groups:
    Humanities, Languages, Sciences, and Mathematics.
    This scenario is based on the exams administered between 2009 and 2023.
    The dataset is available at: https://huggingface.co/datasets/eduagarcia/enem_challenge
    """

    name = "enem_challenge"
    description = "ENEM Challenge dataset"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Download the raw data and read all the questions
        dataset: Any
        # Read all the instances
        instances: List[Instance] = []
        cache_dir = str(Path(output_path) / "data")

        dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir)
        for example in dataset["train"]:
            question = example["question"]
            choices = example["choices"]
            answer = example["answerKey"]
            # Skip every canceled ("ANULADO") question
            if answer == "ANULADO":
                continue
            answers_dict = dict(zip(choices["label"], choices["text"]))
            correct_answer = answers_dict[answer]

            def answer_to_reference(answer: str) -> Reference:
                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])

            instance = Instance(
                input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
            )
            instances.append(instance)
        return instances
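
For reference, here is a hypothetical raw record from eduagarcia/enem_challenge, showing the fields the scenario consumes (the field names match the code above; the values are invented):

# Hypothetical record shape; values invented for illustration.
example = {
    "question": "Qual é a capital da França?",
    "choices": {
        "label": ["A", "B", "C", "D", "E"],
        "text": ["Londres", "Paris", "Roma", "Berlim", "Sydney"],
    },
    "answerKey": "B",
}

# Mirrors the label-to-text mapping performed in get_instances above.
answers_dict = dict(zip(example["choices"]["label"], example["choices"]["text"]))
assert answers_dict[example["answerKey"]] == "Paris"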
53 changes: 53 additions & 0 deletions src/helm/benchmark/scenarios/test_enem_challenge_scenario.py
@@ -0,0 +1,53 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario
from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference


@pytest.mark.scenarios
def test_enem_challenge_scenario():
    enem_scenario = ENEMChallengeScenario()
    with TemporaryDirectory() as tmpdir:
        instances = enem_scenario.get_instances(tmpdir)
        assert len(instances) == 1431
        assert instances[0].split == TEST_SPLIT

        assert instances[0].input.text.startswith(
            "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)"
        )
        assert len(instances[0].input.text) == 1163

        assert instances[0].references == [
            Reference(
                output=Output(
                    text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. "  # noqa: E501
                ),
                tags=[CORRECT_TAG],
            ),
            Reference(
                output=Output(
                    text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. "  # noqa: E501
                ),
                tags=[],
            ),
        ]
        assert instances[0].references[2].is_correct
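
The 1431 figure asserted above can be cross-checked against the Hugging Face dataset directly; a quick sketch (assumes network access and the datasets library):

from datasets import load_dataset

# Count the questions that survive the ANULADO (canceled) filter used by the scenario.
ds = load_dataset("eduagarcia/enem_challenge", split="train")
print(sum(1 for ex in ds if ex["answerKey"] != "ANULADO"))  # the test expects 1431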
146 changes: 146 additions & 0 deletions src/helm/benchmark/static/schema_enem_challenge.yaml
@@ -0,0 +1,146 @@
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false


############################################################
perturbations: []

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  # - name: efficiency
  #   display_name: Efficiency
  #   metrics:
  #     - name: inference_runtime
  #       split: ${main_split}

  - name: general_information
    display_name: General information
    hide_win_rates: true
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}

############################################################
run_groups:
  - name: core_scenarios
    display_name: Core Scenarios
    description: Core Scenarios
    category: All scenarios
    subgroups:
      - enem_challenge

  - name: enem_challenge
    display_name: ENEM Challenge
    description: ENEM Challenge
    metric_groups:
      - accuracy
      # - efficiency
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: "multiple-choice question answering"
      what: "general academic subjects"
      who: "Brazilian Ministry of Education"
      when: "between 2009 and 2023"
      language: Portuguese
9 changes: 9 additions & 0 deletions src/helm/config/model_deployments.yaml
@@ -2872,3 +2872,12 @@ model_deployments:
    class_name: "helm.clients.huggingface_client.HuggingFaceClient"
    args:
      pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base

- name: huggingface/sabia-7b
  model_name: maritaca-ai/sabia-7b
  tokenizer_name: maritaca-ai/sabia-7b
  max_sequence_length: 2048
  client_spec:
    class_name: "helm.clients.huggingface_client.HuggingFaceClient"
    args:
      pretrained_model_name_or_path: maritaca-ai/sabia-7b
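
Outside HELM, the checkpoint this deployment points at can be exercised directly as a sanity check; a minimal sketch (assumes the transformers package and enough memory for a 7B model; this approximates what HuggingFaceClient wraps but is not HELM code):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("maritaca-ai/sabia-7b")
model = AutoModelForCausalLM.from_pretrained("maritaca-ai/sabia-7b")

# Prompt in the same Pergunta/Resposta style the adapter uses.
inputs = tokenizer("Pergunta: Qual é a capital da França?\nResposta:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))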
10 changes: 10 additions & 0 deletions src/helm/config/model_metadata.yaml
@@ -3499,3 +3499,13 @@ models:
  num_parameters: 1380000000
  release_date: 2024-10-21
  tags: [TEXT_MODEL_TAG]

- name: maritaca-ai/sabia-7b
  display_name: Sabia 7B
  description: Sabia 7B
  creator_organization_name: MARITACA-AI
  access: open
  num_parameters: 6740000000
  release_date: 2023-11-08
  tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

8 changes: 8 additions & 0 deletions src/helm/config/tokenizer_configs.yaml
@@ -750,3 +750,11 @@ tokenizer_configs:
      pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
  end_of_text_token: ""
  prefix_token: ""

- name: maritaca-ai/sabia-7b
  tokenizer_spec:
    class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
    args:
      pretrained_model_name_or_path: maritaca-ai/sabia-7b
  end_of_text_token: "</s>"
  prefix_token: "<s>"
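
The "<s>" prefix and "</s>" end-of-text tokens are Llama-style special tokens. Assuming Sabiá 7B ships a Llama-style tokenizer, the configured values can be checked against the tokenizer itself:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("maritaca-ai/sabia-7b")
print(tok.bos_token)  # expected to match prefix_token: "<s>"
print(tok.eos_token)  # expected to match end_of_text_token: "</s>"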
