diff --git a/src/helm/benchmark/run_specs/enem_challenge_specs.py b/src/helm/benchmark/run_specs/enem_challenge_specs.py
new file mode 100644
index 0000000000..a06cf2ecee
--- /dev/null
+++ b/src/helm/benchmark/run_specs/enem_challenge_specs.py
@@ -0,0 +1,31 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("enem_challenge")
+def get_enem_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
+        "Se as opções forem A, B, C, D e E, "
+        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
+        "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n"
+        "Resposta: B",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="enem_challenge",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["enem_challenge"],
+    )
diff --git a/src/helm/benchmark/scenarios/enem_challenge_scenario.py b/src/helm/benchmark/scenarios/enem_challenge_scenario.py
new file mode 100644
index 0000000000..d05b295186
--- /dev/null
+++ b/src/helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -0,0 +1,58 @@
+from typing import List, Any
+from pathlib import Path
+from datasets import load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class ENEMChallengeScenario(Scenario):
+    """
+    The Exame Nacional do Ensino Médio (ENEM) is an advanced high-school level exam administered
+    every year by the Brazilian government to students who wish to pursue a university degree.
+
+    The questions cover a broad range of intellectual fields and are divided into four groups:
+    Humanities, Languages, Sciences and Mathematics.
+
+    This scenario is based on the exams that were administered between 2009 and 2023.
+
+    The dataset can be found at: https://huggingface.co/datasets/eduagarcia/enem_challenge
+    """
+
+    name = "enem_challenge"
+    description = "ENEM Challenge dataset"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the questions
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question = example["question"]
+            choices = example["choices"]
+            answer = example["answerKey"]
+            # Skip every canceled question
+            if answer == "ANULADO":
+                continue
+            answers_dict = dict(zip(choices["label"], choices["text"]))
+            correct_answer = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+            instance = Instance(
+                input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
+            )
+            instances.append(instance)
+        return instances
diff --git a/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py b/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py
new file mode 100644
index 0000000000..db2fc0cff8
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py
@@ -0,0 +1,53 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_enem_challenge_scenario():
+    enem_scenario = ENEMChallengeScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = enem_scenario.get_instances(tmpdir)
+    assert len(instances) == 1431
+    assert instances[0].split == TEST_SPLIT
+
+    assert instances[0].input.text.startswith(
+        "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)"
+    )
+    assert len(instances[0].input.text) == 1163
+
+    assert instances[0].references == [
+        Reference(
+            output=Output(
+                text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. "  # noqa: E501
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. "  # noqa: E501
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. "  # noqa: E501
+            ),
+            tags=[CORRECT_TAG],
+        ),
+        Reference(
+            output=Output(
+                text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. "  # noqa: E501
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. "  # noqa: E501
+            ),
+            tags=[],
+        ),
+    ]
+    assert instances[0].references[2].is_correct
diff --git a/src/helm/benchmark/static/schema_enem_challenge.yaml b/src/helm/benchmark/static/schema_enem_challenge.yaml
new file mode 100644
index 0000000000..f329a2d104
--- /dev/null
+++ b/src/helm/benchmark/static/schema_enem_challenge.yaml
@@ -0,0 +1,146 @@
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  # - name: efficiency
+  #   display_name: Efficiency
+  #   metrics:
+  #     - name: inference_runtime
+  #       split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: Core Scenarios
+    description: Core Scenarios
+    category: All scenarios
+    subgroups:
+      - enem_challenge
+
+  - name: enem_challenge
+    display_name: ENEM Challenge
+    description: Multiple-choice questions from the Brazilian ENEM exams administered between 2009 and 2023.
+    metric_groups:
+      - accuracy
+      # - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple-choice question answering"
+      what: "general academic subjects"
+      who: "Brazilian Ministry of Education"
+      when: "between 2009 and 2023"
+      language: Portuguese
diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
index 95214d068b..6682845edd 100644
--- a/src/helm/config/model_deployments.yaml
+++ b/src/helm/config/model_deployments.yaml
@@ -2863,3 +2863,12 @@ model_deployments:
       class_name: "helm.clients.huggingface_client.HuggingFaceClient"
       args:
         pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
+
+  - name: huggingface/sabia-7b
+    model_name: maritaca-ai/sabia-7b
+    tokenizer_name: maritaca-ai/sabia-7b
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        pretrained_model_name_or_path: maritaca-ai/sabia-7b
diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index c3bb0f54b8..bfdf49ce5b 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -3490,3 +3490,13 @@ models:
     num_parameters: 1380000000
     release_date: 2024-10-21
     tags: [TEXT_MODEL_TAG]
+
+  - name: maritaca-ai/sabia-7b
+    display_name: Sabiá-7B
+    description: Sabiá-7B is a Portuguese language model developed by Maritaca AI.
+    creator_organization_name: Maritaca AI
+    access: open
+    num_parameters: 6740000000
+    release_date: 2023-11-08
+    tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml
index ef7dda6d76..f9641c5be1 100644
--- a/src/helm/config/tokenizer_configs.yaml
+++ b/src/helm/config/tokenizer_configs.yaml
@@ -742,3 +742,11 @@ tokenizer_configs:
         pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
     end_of_text_token: ""
     prefix_token: ""
+
+  - name: maritaca-ai/sabia-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: maritaca-ai/sabia-7b
+    end_of_text_token: ""
+    prefix_token: ""
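
For reviewers who want to see how the scenario above turns a raw dataset row into HELM references, the following self-contained Python sketch mirrors the logic of ENEMChallengeScenario.get_instances. The example row is invented for illustration, and the sketch deliberately avoids importing HELM or downloading the real eduagarcia/enem_challenge data; in the actual scenario each choice becomes a Reference and only the correct one carries CORRECT_TAG.

# Minimal sketch of the reference-building step in get_instances (illustrative row, not real data).
example = {
    "question": "Qual é a capital da França?",
    "choices": {
        "label": ["A", "B", "C", "D", "E"],
        "text": ["Londres", "Paris", "Roma", "Berlim", "Sydney"],
    },
    "answerKey": "B",
}

if example["answerKey"] != "ANULADO":  # canceled ("ANULADO") questions are skipped entirely
    # Map each answer letter to its text, then look up the text of the correct answer.
    answers_dict = dict(zip(example["choices"]["label"], example["choices"]["text"]))
    correct_answer = answers_dict[example["answerKey"]]
    # One entry per choice; only the correct one is tagged (standing in for CORRECT_TAG).
    references = [(text, ["correct"] if text == correct_answer else []) for text in example["choices"]["text"]]
    print(references)
    # [('Londres', []), ('Paris', ['correct']), ('Roma', []), ('Berlim', []), ('Sydney', [])]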