From 0f6e7792d823e2b8d3fb2840e8b2b7fd357630f7 Mon Sep 17 00:00:00 2001
From: huyiwen <1020030101@qq.com>
Date: Thu, 29 Aug 2024 16:57:01 +0800
Subject: [PATCH 01/10] [dataset] Add 3 Imbue datasets

---
 utilization/dataset/imbue_code.py    | 38 ++++++++++++++++++++++++
 utilization/dataset/imbue_private.py | 43 ++++++++++++++++++++++++++++
 utilization/dataset/imbue_public.py  | 43 ++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 utilization/dataset/imbue_code.py
 create mode 100644 utilization/dataset/imbue_private.py
 create mode 100644 utilization/dataset/imbue_public.py

diff --git a/utilization/dataset/imbue_code.py b/utilization/dataset/imbue_code.py
new file mode 100644
index 00000000..ba1c90f9
--- /dev/null
+++ b/utilization/dataset/imbue_code.py
@@ -0,0 +1,38 @@
+from functools import cached_property
+from logging import getLogger
+
+from .multiple_choice_dataset import MultipleChoiceDataset
+
+logger = getLogger(__name__)
+
+
+class ImbueCode(MultipleChoiceDataset):
+    """The dataset of Imbue code understanding questions.
+
+    These examples fall into 2 categories:
+    - "cloze": fill in the hole to produce the specified outcome;
+    - "eval": given a snippet of python code, determine the outcome.
+    Some questions are very easy, some are much more challenging. Most (if not all) of these questions should be relatively straightforward for an experienced programmer, even without a pencil and paper. Released as part of Imbue's 70b evals post.
+
+    Link: https://huggingface.co/datasets/imbue/code-comprehension?row=0
+
+    Example (To avoid data contamination, some fields are omitted):
+        'question': 'If we execute the code below, what will `result` be equal to? ```python ... ```'
+        'choices': [ "'66-66-66-foo'", "'foo-66-66-66'", "'66--66--66--foo'", "''" ]
+        'correct_answer': '66- ... -foo'
+    """
+
+    instruction = "{{question}}{{'\n' + options if options}}\nAnswer:"
+    evaluation_set = "train"
+    example_set = None
+    load_args = ("imbue/code-comprehension", )
+
+    def format_instance(self, instance):
+        instance["target_idx"] = instance["choices"].index(
+            instance["correct_answer"])
+        instance["options"] = instance["choices"]
+        return instance
+
+    @cached_property
+    def references(self):
+        return [instance["target_idx"] for instance in self.evaluation_data]
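For readers who want to see the raw rows behind `load_args`, the following sketch is illustrative only and not part of the patch. It assumes the Hugging Face `datasets` library and network access; the field names are taken from the class docstring above.

```python
# Hedged sketch: peek at the Hugging Face dataset that ImbueCode points at via
# `load_args`. Field names follow the docstring; availability of a "train"
# split mirrors the class's evaluation_set.
from datasets import load_dataset

ds = load_dataset("imbue/code-comprehension", split="train")
row = ds[0]
print(row["question"])                         # prompt shown to the model
print(row["choices"], row["correct_answer"])   # candidate options and gold answer
```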
diff --git a/utilization/dataset/imbue_private.py b/utilization/dataset/imbue_private.py
new file mode 100644
index 00000000..bd9d8033
--- /dev/null
+++ b/utilization/dataset/imbue_private.py
@@ -0,0 +1,43 @@
+from functools import cached_property
+from logging import getLogger
+
+from .multiple_choice_dataset import MultipleChoiceDataset
+
+logger = getLogger(__name__)
+
+
+class ImbuePrivate(MultipleChoiceDataset):
+    """The dataset of Imbue private evaluations.
+
+    High-quality question-answer pairs, from private versions of datasets designed to mimic ANLI, ARC, BoolQ, ETHICS, GSM8K, HellaSwag, OpenBookQA, MultiRC, RACE, Social IQa, and WinoGrande. For details, see https://imbue.com/research/70b-evals/. Format: each row contains a question, candidate answers, the correct answer (or multiple correct answers in the case of MultiRC questions), and a question quality score.
+
+    Link: https://huggingface.co/datasets/imbue/high_quality_private_evaluations
+
+    Example (To avoid data contamination, some fields are omitted):
+        'question': 'For this question, first read the passage below. "The artist ..." Based on the passage above, answer the following question. Which wealth ...?'
+        'correct_choices': [ "A ... ire" ]
+        'choices': [ "A billionaire", "A centimillionaire", "A trillionaire", "A decamillionaire" ]
+        'quality': 0.245109
+        'original_dataset': race
+    """
+
+    instruction = "{{question}}{{'\n' + options if options}}\nAnswer:"
+    evaluation_set = "train"
+    example_set = None
+    load_args = ("imbue/high_quality_private_evaluations", )
+    category_column = "original_dataset"
+
+    def format_instance(self, instance):
+        if len(instance["correct_choices"]) > 1:
+            logger.warning(
+                f"Multiple correct choices found: {len(instance['correct_choices'])}. Only the first one is used. Multiple correct choices may be supported in the future."
+            )
+
+        correct_choice = instance["correct_choices"][0]
+        instance["target_idx"] = instance["choices"].index(correct_choice)
+        instance["options"] = instance["choices"]
+        return instance
+
+    @cached_property
+    def references(self):
+        return [instance["target_idx"] for instance in self.evaluation_data]
diff --git a/utilization/dataset/imbue_public.py b/utilization/dataset/imbue_public.py
new file mode 100644
index 00000000..e9429335
--- /dev/null
+++ b/utilization/dataset/imbue_public.py
@@ -0,0 +1,43 @@
+from functools import cached_property
+from logging import getLogger
+
+from .multiple_choice_dataset import MultipleChoiceDataset
+
+logger = getLogger(__name__)
+
+
+class ImbuePublic(MultipleChoiceDataset):
+    """The dataset of Imbue public evaluations.
+
+    High-quality question-answer pairs, originally from ANLI, ARC, BoolQ, ETHICS, GSM8K, HellaSwag, OpenBookQA, MultiRC, RACE, Social IQa, and WinoGrande. For details, see https://imbue.com/research/70b-evals/. Format: each row contains a question, candidate answers, the correct answer (or multiple correct answers in the case of MultiRC questions), and a question quality score.
+
+    Link: https://huggingface.co/datasets/imbue/high_quality_public_evaluations
+
+    Example:
+        'question': 'The man was released from jail. What is the cause of this?'
+        'correct_choices': [ "His family paid his bail." ]
+        'choices': [ "His family paid his bail.", "He attacked a fellow inmate." ]
+        'quality': 0.348698
+        'original_dataset': copa
+    """
+
+    instruction = "{{question}}{{'\n' + options if options}}\nAnswer:"
+    evaluation_set = "train"
+    example_set = None
+    load_args = ("imbue/high_quality_public_evaluations", )
+    category_column = "original_dataset"
+
+    def format_instance(self, instance):
+        if len(instance["correct_choices"]) > 1:
+            logger.warning(
+                f"Multiple correct choices found: {len(instance['correct_choices'])}. Only the first one is used. Multiple correct choices may be supported in the future."
+            )
+
+        correct_choice = instance["correct_choices"][0]
+        instance["target_idx"] = instance["choices"].index(correct_choice)
+        instance["options"] = instance["choices"]
+        return instance
+
+    @cached_property
+    def references(self):
+        return [instance["target_idx"] for instance in self.evaluation_data]
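As a quick sanity check of the option/target mapping used by ImbuePrivate and ImbuePublic above (ImbueCode does the same with a single `correct_answer` field), here is a standalone sketch that is not part of the patch; the example row is the one quoted in the ImbuePublic docstring.

```python
# Standalone illustration of the mapping performed by format_instance above:
# the first correct choice is located in `choices` and its index becomes the
# reference label for the multiple-choice evaluation.
from typing import Any, Dict

def format_instance(instance: Dict[str, Any]) -> Dict[str, Any]:
    # Only the first correct choice is scored, mirroring the classes above.
    correct_choice = instance["correct_choices"][0]
    instance["target_idx"] = instance["choices"].index(correct_choice)
    instance["options"] = instance["choices"]
    return instance

row = {
    "question": "The man was released from jail. What is the cause of this?",
    "correct_choices": ["His family paid his bail."],
    "choices": ["His family paid his bail.", "He attacked a fellow inmate."],
}
print(format_instance(row)["target_idx"])  # -> 0
```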
From dc64f741d748b9e5818dc874b21ff6724c2d8af4 Mon Sep 17 00:00:00 2001
From: huyiwen <1020030101@qq.com>
Date: Thu, 29 Aug 2024 19:16:52 +0800
Subject: [PATCH 02/10] [doc] add imbue

---
 README.md                              |  4 +--
 docs/utilization/supported-datasets.md | 35 +++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index af340433..c3e18b94 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ Training Utilization
 - **Blazingly Fast:** By managing the KV Cache of prefixes, we can speed up local inference by up to 6x 🚀.
-- **Comprehensive Evaluation:** 56+ commonly used [datasets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md) and benchmarks in evaluating LLMs.
+- **Comprehensive Evaluation:** 59+ commonly used [datasets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md) and benchmarks in evaluating LLMs.
 - **Evaluation Methods:** Accurately reproduce results from original papers of OpenAI, LLaMA, Mistral, and other models.
 - **In-Context Learning:** We support various ICL strategies, including [`KATE`](https://aclanthology.org/2022.deelio-1.10/), [`GlobalE`](https://aclanthology.org/2022.acl-long.556/), and [`APE`](https://arxiv.org/abs/2211.01910).
 - **Chain-of-Thought:** For some datasets, we support three types of CoT evaluation: `base`, [`least-to-most`](https://arxiv.org/abs/2205.10625), and [`pal`](https://arxiv.org/abs/2211.10435).
@@ -140,7 +140,7 @@ For more details, view the [training](https://github.com/RUCAIBox/LLMBox/tree/ma
 We provide a broad support on Huggingface models (e.g. `LLaMA-3`, `Mistral`, or the model you are building on), OpenAI, Anthropic, QWen and other OpenAI-compatible models for further utilization. Full list of model backends: [here](https://github.com/RUCAIBox/LLMBox/tree/main/utilization#supported-models).

-Currently a total of 56+ commonly used datasets are supported, including: `HellaSwag`, `MMLU`, `GSM8K`, `GPQA`, `AGIEval`, `CEval`, and `CMMLU`. Full list of datasets: [here](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md).
+Currently a total of 59+ commonly used datasets are supported, including: `HellaSwag`, `MMLU`, `GSM8K`, `GPQA`, `AGIEval`, `CEval`, and `CMMLU`. Full list of datasets: [here](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md).

 ```bash
 CUDA_VISIBLE_DEVICES=0 python inference.py \
diff --git a/docs/utilization/supported-datasets.md b/docs/utilization/supported-datasets.md
index c7f762d3..d211431a 100644
--- a/docs/utilization/supported-datasets.md
+++ b/docs/utilization/supported-datasets.md
@@ -1,6 +1,12 @@
 # Supported Datasets of LLMBox

-We currently support 56+ commonly used datasets for LLMs. Each dataset is either a multiple-choice dataset or a generation dataset.
+We currently support 59+ commonly used datasets for LLMs.
+
+## Understanding Evaluation Type
+
+Each dataset is either a multiple-choice dataset or a generation dataset. You can find the difference between them [here](https://github.com/RUCAIBox/LLMBox/tree/main/utilization#dataset-arguments).
+
+## Understanding Subsets

 Some datasets have multiple subsets. For example, Massive Multitask Language Understanding (`mmlu`) dataset contains 57 different subsets categorized into four categories: `stem`, `social_sciences`, `humanities`, and `other`.

 While some other dataset is a subset of another dataset. For example, Choice Of Plausible Alternatives (`copa`) is a subset of `super_glue`.

 See how to [load datasets with subsets](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-load-datasets-with-subsets.md).

-Some datasets support Chain-of-Thought reasoning. For example, Grade School Math 8K (`gsm8k`) supports three types of CoT: `base`, `least_to_most`, and `pal`.
+## Understanding CoT

-You can find the supported datasets in the following table.
+Some datasets support Chain-of-Thought reasoning. For example, Grade School Math 8K (`gsm8k`) supports three types of CoT: `base`, `least_to_most`, and `pal`.
+
+## Supported Datasets
@@ -177,7 +183,7 @@
 | ... (`GPQA`) | `gpqa_main` (default), `gpqa_extended`, ... | ... |
+| ... (`imbue_code`) | ... |
+| ... (`imbue_private`) | ... |
+| ... (`imbue_public`) | ... |
 | ... (`lambada`) | `default` (default), `de`, ... (source: EleutherAI/lambada_openai) | ... |
From 8b3a443b41ee9ada61e3497eafc72050c8cdcb60 Mon Sep 17 00:00:00 2001
From: huyiwen <1020030101@qq.com>
Date: Thu, 29 Aug 2024 19:49:26 +0800
Subject: [PATCH 10/10] [ci] skip dashscope/resources/qwen.tiktoken not found

---
 tests/dry_test/test_models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/dry_test/test_models.py b/tests/dry_test/test_models.py
index d0203a1c..e7c9ec2e 100644
--- a/tests/dry_test/test_models.py
+++ b/tests/dry_test/test_models.py
@@ -26,3 +26,5 @@ def test_models_dry_run(run_evaluate, model, dataset, extra_args):
         run_evaluate(["-m", model, "-d", dataset, "-b", "10", "--dry_run"] + extra_args, cuda=0)
     except torch.cuda.OutOfMemoryError:
         pytest.skip(f"Out of memory error on {model} {dataset}")
+    except FileNotFoundError:
+        pytest.skip(f"File not found error on {model} {dataset}")
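The skip-on-missing-resource pattern applied above can also be exercised in isolation. The sketch below is illustrative only and not part of the patch; the file path comes from the commit subject, while the test name and skip message are made up for the example.

```python
# Hedged, standalone illustration of the pattern added above: if an optional
# resource file is absent (e.g. dashscope extras not installed), the test is
# skipped instead of failing the dry-run suite.
import pytest

def test_optional_tokenizer_resource():
    try:
        with open("dashscope/resources/qwen.tiktoken", "rb") as f:
            data = f.read()
    except FileNotFoundError:
        pytest.skip("qwen.tiktoken resource not found; skipping dry run")
    assert data
```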