Commit b8a140f: Adding ENEM Challenge Scenario & Maritaca AI model (Sabiá 7B) (#3185)
Co-authored-by: Yifan Mai <[email protected]>
1 parent: ff9c7c9
Showing 7 changed files with 315 additions and 0 deletions.
@@ -0,0 +1,31 @@
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec


@run_spec_function("enem_challenge")
def get_enem_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={}
    )

    # The instructions are in Portuguese; in English: "Give an answer by selecting a letter
    # from the provided options. If the options are A, B, C, D and E, your answer must
    # consist of a single letter corresponding to the correct answer. Example: What is the
    # capital of France? A. London B. Paris C. Rome D. Berlin E. Sydney Answer: B"
    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
        "Se as opções forem A, B, C, D e E, "
        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
        "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n"
        "Resposta: B",
        input_noun="Pergunta",  # "Question"
        output_noun="Resposta",  # "Answer"
    )

    return RunSpec(
        name="enem_challenge",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["enem_challenge"],
    )
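For reference, ADAPT_MULTIPLE_CHOICE_JOINT presents the question and all lettered options in a single prompt. Below is a minimal sketch of the prompt shape this adapter spec implies; it is an illustration of the layout, not HELM's actual rendering code, and the exact spacing is an assumption.

# Illustration only: approximates the joint multiple-choice layout implied by
# input_noun="Pergunta" and output_noun="Resposta".
question = "Qual é a capital da França?"
options = ["Londres", "Paris", "Roma", "Berlim", "Sydney"]

prompt_lines = [f"Pergunta: {question}"]
for letter, option in zip("ABCDE", options):
    prompt_lines.append(f"{letter}. {option}")
prompt_lines.append("Resposta:")
print("\n".join(prompt_lines))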
@@ -0,0 +1,58 @@
from typing import List, Any
from pathlib import Path
from datasets import load_dataset

from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    CORRECT_TAG,
    TEST_SPLIT,
    Input,
    Output,
)


class ENEMChallengeScenario(Scenario):
    """
    The Exame Nacional do Ensino Médio (ENEM) is an advanced high-school-level exam administered
    every year by the Brazilian government to students who wish to pursue a university degree.
    The questions cover a broad range of intellectual fields and are divided into four groups:
    Humanities, Languages, Sciences, and Mathematics.
    This scenario is based on the exams administered from 2009 to 2023.
    The dataset can be found at: https://huggingface.co/datasets/eduagarcia/enem_challenge
    """

    name = "enem_challenge"
    description = "ENEM Challenge dataset"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Download the raw data and read all the questions
        dataset: Any
        # Read all the instances
        instances: List[Instance] = []
        cache_dir = str(Path(output_path) / "data")

        dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir)
        for example in dataset["train"]:
            question = example["question"]
            choices = example["choices"]
            answer = example["answerKey"]
            # Skip every canceled question ("ANULADO" means "annulled")
            if answer == "ANULADO":
                continue
            # Map option labels (e.g., "A") to option texts, then look up the correct one
            answers_dict = dict(zip(choices["label"], choices["text"]))
            correct_answer = answers_dict[answer]

            def answer_to_reference(answer: str) -> Reference:
                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])

            instance = Instance(
                input=Input(text=question),
                split=TEST_SPLIT,
                references=list(map(answer_to_reference, choices["text"])),
            )
            instances.append(instance)
        return instances
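A minimal usage sketch (an illustration, not part of the commit): calling get_instances directly downloads the dataset from Hugging Face and caches it under "<output_path>/data".

from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario

# Any writable directory works as output_path; the data is cached under "<output_path>/data".
scenario = ENEMChallengeScenario()
instances = scenario.get_instances("./enem_scratch")
print(len(instances))                # 1431 non-canceled questions, per the test below
print(instances[0].input.text[:80])  # start of the first question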
src/helm/benchmark/scenarios/test_enem_challenge_scenario.py (53 additions, 0 deletions)
@@ -0,0 +1,53 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario
from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference


@pytest.mark.scenarios
def test_enem_challenge_scenario():
    enem_scenario = ENEMChallengeScenario()
    with TemporaryDirectory() as tmpdir:
        instances = enem_scenario.get_instances(tmpdir)
        assert len(instances) == 1431
        assert instances[0].split == TEST_SPLIT

        assert instances[0].input.text.startswith(
            "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)"
        )
        assert len(instances[0].input.text) == 1163

        assert instances[0].references == [
            Reference(
                output=Output(
                    text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. "  # noqa: E501
                ),
                tags=[CORRECT_TAG],
            ),
            Reference(
                output=Output(
                    text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. "  # noqa: E501
                ),
                tags=[],
            ),
            Reference(
                output=Output(
                    text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. "  # noqa: E501
                ),
                tags=[],
            ),
        ]
        assert instances[0].references[2].is_correct
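Note that this test downloads the dataset over the network. Since it is tagged with the scenarios marker, it can be run selectively with pytest -m scenarios src/helm/benchmark/scenarios/test_enem_challenge_scenario.py.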
@@ -0,0 +1,146 @@
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false


############################################################
perturbations: []

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  # - name: efficiency
  #   display_name: Efficiency
  #   metrics:
  #     - name: inference_runtime
  #       split: ${main_split}

  - name: general_information
    display_name: General information
    hide_win_rates: true
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}

############################################################
run_groups:
  - name: core_scenarios
    display_name: Core Scenarios
    description: Core Scenarios
    category: All scenarios
    subgroups:
      - enem_challenge

  - name: enem_challenge
    display_name: ENEM Challenge
    description: ENEM Challenge
    metric_groups:
      - accuracy
      # - efficiency
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: "multiple-choice question answering"
      what: "general academic subjects"
      who: "Brazilian Ministry of Education"
      when: "between 2009 and 2023"
      language: Portuguese
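The repository path of this schema file is not shown in the diff. As a quick sanity check, the file can be parsed with PyYAML; a sketch, using a hypothetical filename:

# Sketch for sanity-checking the schema; "schema_enem_challenge.yaml" is a hypothetical
# filename, since the diff does not show where this YAML lives in the repository.
import yaml

with open("schema_enem_challenge.yaml") as f:
    schema = yaml.safe_load(f)

print(len(schema["metrics"]))                     # number of metric definitions
print([g["name"] for g in schema["run_groups"]])  # ['core_scenarios', 'enem_challenge']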