Commit b8a140f: Adding ENEM Challenge Scenario & Maritaca AI model (Sabiá 7B) (#3185)
Co-authored-by: Yifan Mai <[email protected]>
1 parent: ff9c7c9
Showing 7 changed files with 315 additions and 0 deletions.
@@ -0,0 +1,31 @@
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec


@run_spec_function("enem_challenge")
def get_enem_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={}
    )

    # The instructions are in Portuguese; in English: "Give an answer by selecting a letter
    # from the provided options. If the options are A, B, C, D and E, your answer must
    # consist of a single letter corresponding to the correct answer. Example: What is the
    # capital of France? A. London B. Paris C. Rome D. Berlin E. Sydney Answer: B"
    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
        "Se as opções forem A, B, C, D e E, "
        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
        "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n"
        "Resposta: B",
        input_noun="Pergunta",  # "Question"
        output_noun="Resposta",  # "Answer"
    )

    return RunSpec(
        name="enem_challenge",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["enem_challenge"],
    )
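For reference, ADAPT_MULTIPLE_CHOICE_JOINT presents the question and all lettered options in a single prompt. Below is a minimal sketch of the prompt shape this adapter spec implies; it is an illustration of the layout, not HELM's actual rendering code, and the exact spacing is an assumption.

# Illustration only: approximates the joint multiple-choice layout implied by
# input_noun="Pergunta" and output_noun="Resposta".
question = "Qual é a capital da França?"
options = ["Londres", "Paris", "Roma", "Berlim", "Sydney"]

prompt_lines = [f"Pergunta: {question}"]
for letter, option in zip("ABCDE", options):
    prompt_lines.append(f"{letter}. {option}")
prompt_lines.append("Resposta:")
print("\n".join(prompt_lines))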
@@ -0,0 +1,58 @@
from typing import List, Any
from pathlib import Path
from datasets import load_dataset

from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    CORRECT_TAG,
    TEST_SPLIT,
    Input,
    Output,
)


class ENEMChallengeScenario(Scenario):
    """
    The Exame Nacional do Ensino Médio (ENEM) is an advanced high-school-level exam administered
    every year by the Brazilian government to students who wish to pursue a university degree.
    The questions cover a broad range of intellectual fields and are divided into four groups:
    Humanities, Languages, Sciences, and Mathematics.
    This scenario is based on the exams administered from 2009 to 2023.
    The dataset can be found at: https://huggingface.co/datasets/eduagarcia/enem_challenge
    """

    name = "enem_challenge"
    description = "ENEM Challenge dataset"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Download the raw data and read all the questions
        dataset: Any
        # Read all the instances
        instances: List[Instance] = []
        cache_dir = str(Path(output_path) / "data")

        dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir)
        for example in dataset["train"]:
            question = example["question"]
            choices = example["choices"]
            answer = example["answerKey"]
            # Skip every canceled question ("ANULADO" means "annulled")
            if answer == "ANULADO":
                continue
            # Map option labels (e.g., "A") to option texts, then look up the correct one
            answers_dict = dict(zip(choices["label"], choices["text"]))
            correct_answer = answers_dict[answer]

            def answer_to_reference(answer: str) -> Reference:
                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])

            instance = Instance(
                input=Input(text=question),
                split=TEST_SPLIT,
                references=list(map(answer_to_reference, choices["text"])),
            )
            instances.append(instance)
        return instances
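A minimal usage sketch (an illustration, not part of the commit): calling get_instances directly downloads the dataset from Hugging Face and caches it under "<output_path>/data".

from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario

# Any writable directory works as output_path; the data is cached under "<output_path>/data".
scenario = ENEMChallengeScenario()
instances = scenario.get_instances("./enem_scratch")
print(len(instances))                # 1431 non-canceled questions, per the test below
print(instances[0].input.text[:80])  # start of the first question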
src/helm/benchmark/scenarios/test_enem_challenge_scenario.py (53 additions, 0 deletions)
@@ -0,0 +1,53 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario
from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference


@pytest.mark.scenarios
def test_enem_challenge_scenario():
    enem_scenario = ENEMChallengeScenario()
    with TemporaryDirectory() as tmpdir:
        instances = enem_scenario.get_instances(tmpdir)
        assert len(instances) == 1431
        assert instances[0].split == TEST_SPLIT

        assert instances[0].input.text.startswith(
            "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)"
        )
        assert len(instances[0].input.text) == 1163

        assert instances[0].references == [
            Reference(
                output=Output(
                    text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. "  # noqa: E501
                ),
                tags=[CORRECT_TAG],
            ),
            Reference(
                output=Output(
                    text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. "  # noqa: E501
                ),
                tags=[],
            ),
        ]
        assert instances[0].references[2].is_correct
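Note that this test downloads the dataset over the network. Since it is tagged with the scenarios marker, it can be run selectively with pytest -m scenarios src/helm/benchmark/scenarios/test_enem_challenge_scenario.py.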
@@ -0,0 +1,146 @@
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false


############################################################
perturbations: []

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  # - name: efficiency
  #   display_name: Efficiency
  #   metrics:
  #     - name: inference_runtime
  #       split: ${main_split}

  - name: general_information
    display_name: General information
    hide_win_rates: true
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}

############################################################
run_groups:
  - name: core_scenarios
    display_name: Core Scenarios
    description: Core Scenarios
    category: All scenarios
    subgroups:
      - enem_challenge

  - name: enem_challenge
    display_name: ENEM Challenge
    description: ENEM Challenge
    metric_groups:
      - accuracy
      # - efficiency
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: "multiple-choice question answering"
      what: "general academic subjects"
      who: "Brazilian Ministry of Education"
      when: "between 2009 and 2023"
      language: Portuguese
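The repository path of this schema file is not shown in the diff. As a quick sanity check, the file can be parsed with PyYAML; a sketch, using a hypothetical filename:

# Sketch for sanity-checking the schema; "schema_enem_challenge.yaml" is a hypothetical
# filename, since the diff does not show where this YAML lives in the repository.
import yaml

with open("schema_enem_challenge.yaml") as f:
    schema = yaml.safe_load(f)

print(len(schema["metrics"]))                     # number of metric definitions
print([g["name"] for g in schema["run_groups"]])  # ['core_scenarios', 'enem_challenge']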