diff --git a/.github/workflows/isort-check.yml b/.github/workflows/isort-check.yml index 1ffde953..2053abeb 100644 --- a/.github/workflows/isort-check.yml +++ b/.github/workflows/isort-check.yml @@ -7,7 +7,7 @@ on: - 'utilization/**' jobs: - build: + formatting-check: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/pytest-check.yml b/.github/workflows/pytest-check.yml index 18582d8a..0e993b46 100644 --- a/.github/workflows/pytest-check.yml +++ b/.github/workflows/pytest-check.yml @@ -9,30 +9,78 @@ on: - '.github/workflows/**' jobs: - build: - name: Run tests + Pytest: + name: subtest runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8.18"] + group: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] steps: - - uses: szenius/set-timezone@v1.2 + - uses: szenius/set-timezone@v2.0 with: - timezoneLinux: "Europe/Berlin" + timezoneLinux: "Asia/Shanghai" - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python 3.8.18 uses: actions/setup-python@v4 with: - python-version: ${{ matrix.python-version }} - - name: Install uv - run: pip install uv pip -U + python-version: 3.8.18 - name: Install dependencies - run: uv pip install -r tests/requirements-tests.txt --system - - name: Install isolation dependencies - run: uv pip install vllm --no-build-isolation --system - - uses: pavelzw/pytest-action@v2 + run: | + pip install uv pip -U + uv pip install -r tests/requirements-tests.txt --system + uv pip install vllm --no-build-isolation --system + - name: Run tests + run: pytest --cov --junit-xml=test-results.xml --splits 10 --group ${{ matrix.group }} --reruns 3 --only-rerun PermissionError + env: + GITHUB_ACTION: 1 + - name: Surface failing tests + if: always() + uses: pmeier/pytest-results-action@multi-testsuites with: - emoji: false - verbose: true - job-summary: true + # A list of JUnit XML files, directories containing the former, and wildcard + # patterns to process. + # See @actions/glob for supported patterns. + path: test-results.xml + + # (Optional) Add a summary of the results at the top of the report + summary: true + + # (Optional) Select which results should be included in the report. + # Follows the same syntax as `pytest -r` + display-options: fEX + + # (Optional) Fail the workflow if no JUnit XML was found. + fail-on-empty: true + + # (Optional) Title of the test results section in the workflow summary + title: Test results + - name: Upload coverage + uses: actions/upload-artifact@v2 + with: + name: coverage${{ matrix.group }} + path: .coverage + + Coverage: + needs: Pytest + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.8.18 + uses: actions/setup-python@v4 + with: + python-version: 3.8.18 + - name: Install uv + run: | + pip install uv pip -U + uv pip install -r tests/requirements-tests.txt --system + uv pip install vllm --no-build-isolation --system + - name: Download all artifacts + # Downloads coverage1, coverage2, etc. + uses: actions/download-artifact@v2 + - name: Run coverage + run: | + coverage combine coverage*/.coverage* + coverage report --fail-under=90 + coverage xml + - uses: codecov/codecov-action@v1 \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..ce8974b5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,82 @@ +# Contributing + +Thanks for your interest in contributing to LLMBox! We welcome and appreciate contributions. 
+To report bugs, create a [GitHub issue](https://github.com/RUCAIBox/LLMBox/issues). + +## Contribution Guide +### 1. Fork the Official Repository + +Fork [LLMBox repository](https://github.com/RUCAIBox/LLMBox) into your own account. +Clone your own forked repository into your local environment. + +```shell +git clone git@github.com:/LLMBox.git +``` + +### 2. Configure Git + +Set the official repository as your [upstream](https://www.atlassian.com/git/tutorials/git-forks-and-upstreams) to synchronize with the latest update in the official repository. +Add the original repository as upstream + +```shell +cd LLMBox +git remote add upstream git@github.com:RUCAIBox/LLMBox.git +``` + +Verify that the remote is set. +```shell +git remote -v +``` +You should see both `origin` and `upstream` in the output. + +### 3. Synchronize with Official Repository +Synchronize latest commit with official repository before coding. + +```shell +git fetch upstream +git checkout main +git merge upstream/main +git push origin main +``` + +### 4. Create a New Branch And Open a Pull Request +After you finish implementation, open forked repository. The source branch is your new branch, and the target branch is `RUCAIBox/LLMBox` `main` branch. Then PR should appears in [LLMBox PRs](https://github.com/RUCAIBox/LLMBox/pulls). + +Then LLMBox team will review your code. + +## PR Rules + +### 1. Pull Request title + +As described in [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes: + +- `feat`: A new feature +- `fix`: A bug fix +- `doc`: Documentation only changes +- `refactor`: A code change that neither fixes a bug nor adds a feature +- `style`: A refactoring that improves code style +- `test`: Adding missing tests or correcting existing tests +- `ci`: Changes to CI configuration files and scripts (example scopes: `.github`, `ci` (Buildkite)) +- `revert`: Reverts a previous commit + +For example, a PR title could be: +- `refactor: modify package path` +- `feat(training): xxxx`, where `(training)` means that this PR mainly focuses on the training component. + +You may also check out previous PRs in the [PR list](https://github.com/RUCAIBox/LLMBox/pulls). + +### 2. Pull Request description + +- If your PR is small (such as a typo fix), you can go brief. +- If it is large and you have changed a lot, it's better to write more details. + + +## How to begin +Please refer to the README in each module: +- [training](./training) +- [utilization](./utilization) +- [docs](./docs) + +## Tests +Please navigate to `tests` folder to see existing test suites. +At the moment, we have three kinds of tests: `pytest`, `isort`, and `yapf`. diff --git a/README.md b/README.md index 06db941d..495f8ac1 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ bash bash/run_7b_ds3.sh To utilize your model, or evaluate an existing model, you can run the following command: ```python -python inference.py -m gpt-3.5-turbo -d copa # --num_shot 0 --model_type instruction +python inference.py -m gpt-3.5-turbo -d copa # --num_shot 0 --model_type chat ``` This is default to run the OpenAI GPT 3.5 turbo model on the CoPA dataset in a zero-shot manner. @@ -118,12 +118,11 @@ We provide a broad support on Huggingface models (e.g. `LLaMA-3`, `Mistral`, or Currently a total of 56+ commonly used datasets are supported, including: `HellaSwag`, `MMLU`, `GSM8K`, `GPQA`, `AGIEval`, `CEval`, and `CMMLU`. 
For a full list of supported models and datasets, view the [utilization](https://github.com/RUCAIBox/LLMBox/tree/main/utilization) documentation. ```bash -python inference.py \ +CUDA_VISIBLE_DEVICES=0 python inference.py \ -m llama-2-7b-hf \ -d mmlu agieval:[English] \ - --model_type instruction \ + --model_type chat \ --num_shot 5 \ - --cuda 0 \ --ranking_type ppl_no_option ``` diff --git a/docs/examples/customize_dataset.py b/docs/examples/customize_dataset.py new file mode 100644 index 00000000..e67874ec --- /dev/null +++ b/docs/examples/customize_dataset.py @@ -0,0 +1,47 @@ +import os +import sys + +sys.path.append(".") +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +from utilization import DatasetArguments, ModelArguments, get_evaluator, register_dataset +from utilization.dataset import GenerationDataset + + +@register_dataset(name="my_data") +class MyData(GenerationDataset): + + instruction = "Reply to my message: {input}\nReply:" + metrics = [] + + def format_instance(self, instance: dict) -> dict: + return instance + + @property + def references(self): + return [i["target"] for i in self.evaluation_data] + + +evaluator = get_evaluator( + model_args=ModelArguments(model_name_or_path="gpt-4o"), + dataset_args=DatasetArguments( + dataset_names=["my_data"], + num_shots=1, + max_example_tokens=2560, + ), + evaluation_data=[ + { + "input": "Hello", + "target": "Hi" + }, + { + "input": "How are you?", + "target": "I'm fine, thank you!" + }, + ], + example_data=[{ + "input": "What's the weather like today?", + "target": "It's sunny today." + }] +) +evaluator.evaluate() diff --git a/docs/examples/customize_huggingface_model.py b/docs/examples/customize_huggingface_model.py index 8ba8925d..15436635 100644 --- a/docs/examples/customize_huggingface_model.py +++ b/docs/examples/customize_huggingface_model.py @@ -1,12 +1,14 @@ +import sys + import torch from transformers import LlamaForCausalLM -from utilization import Evaluator -from utilization.model.huggingface_model import get_model_max_length, load_tokenizer -from utilization.utils import DatasetArguments, ModelArguments +sys.path.append(".") +from utilization import DatasetArguments, ModelArguments, get_evaluator def load_hf_model(model_args: ModelArguments): + from utilization.model.huggingface_model import get_model_max_length, load_tokenizer # load your own model model = LlamaForCausalLM.from_pretrained( @@ -24,7 +26,7 @@ def load_hf_model(model_args: ModelArguments): return model, tokenizer -evaluator = Evaluator( +evaluator = get_evaluator( model_args=ModelArguments( model_name_or_path="../your-model-path", model_type="chat", diff --git a/docs/utilization/customize-dataset.md b/docs/utilization/how-to-customize-dataset.md similarity index 97% rename from docs/utilization/customize-dataset.md rename to docs/utilization/how-to-customize-dataset.md index c6f8db6e..c6035f41 100644 --- a/docs/utilization/customize-dataset.md +++ b/docs/utilization/how-to-customize-dataset.md @@ -2,6 +2,8 @@ If you find some datasets are not supported in the current version, feel free to implement your own dataset and submit a PR. +See a full list of supported datasets at [here](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/supported-datasets.md). + ## Choose the Right Dataset We provide two types of datasets: [`GenerationDataset`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/dataset/generation_dataset.py) and [`MultipleChoiceDataset`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/dataset/multiple_choice_dataset.py). 
@@ -35,7 +37,7 @@ These are the attributes you can define in a new dataset:

- `example_set` (`Optional[str]`): The example split of dataset. Example data will be automatically loaded if this is not None.

-- `load_args` (`Union[Tuple[str], Tuple[str, str], Tuple[()]]`, **required\***): Arguments for loading the dataset with huggingface `load_dataset`. See [load from source data](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/customize-dataset.md#load-from-source-data) for details.
+- `load_args` (`Union[Tuple[str], Tuple[str, str], Tuple[()]]`, **required\***): Arguments for loading the dataset with huggingface `load_dataset`. See [load from source data](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-customize-dataset.md#load-from-source-data) for details.

- `extra_model_args` (`Dict[str, Any]`): Extra arguments for the model like `temperature`, `stop` etc. See `set_generation_args`, `set_prob_args`, and `set_ppl_args` for details.

@@ -45,7 +47,7 @@ Then implement the following methods or properties:
- `references` (**required**): Return the reference answers for evaluation.
- `init_arguments`: Initialize the arguments for the dataset. This is called before the raw dataset is loaded.

-See [here](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/customize-dataset.md#advanced-topics) for advanced topics.
+See [here](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-customize-dataset.md#advanced-topics) for advanced topics.

## Load from Source Data

diff --git a/docs/utilization/how-to-customize-model.md b/docs/utilization/how-to-customize-model.md
new file mode 100644
index 00000000..b516d1d7
--- /dev/null
+++ b/docs/utilization/how-to-customize-model.md
@@ -0,0 +1,28 @@
# How to Customize Model

## Customizing HuggingFace Models

If you are building on your own model, such as a fine-tuned model, you can evaluate it easily from a Python script. Detailed steps and example code are provided in the [customize HuggingFace model guide](https://github.com/RUCAIBox/LLMBox/tree/main/docs/examples/customize_huggingface_model.py).

## Adding a New Model Provider

If you're integrating a new model provider, begin by extending the [`Model`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/model/model.py) class. Implement essential methods such as `generation`, `get_ppl` (get perplexity), and `get_prob` (get probability) to support different functionalities. For instance, here's how you might implement the `generation` method for a new model:

```python
class NewModel(Model):

    model_backend = "new_provider"

    def call_model(self, batched_inputs: List[str]) -> List[Any]:
        return ...  # call to model, e.g., self.model.generate(...)

    def to_text(self, result: Any) -> str:
        return ...  # convert result to text, e.g., result['text']

    def generation(self, batched_inputs: List[str]) -> List[str]:
        results = self.call_model(batched_inputs)
        results = [self.to_text(result) for result in results]
        return results
```

Then, register your model in the [`load`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/model/load.py) file.
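For orientation, here is a self-contained sketch of what a complete (if trivial) provider could look like. It is not taken from the repository: the `Model` base class is replaced by a bare stand-in so the snippet runs on its own, the "echo" backend is invented, and the argument/return shapes assumed for `get_ppl` and `get_prob` should be checked against `utilization/model/model.py` before use.

```python
from typing import Any, List, Tuple


class Model:
    """Bare stand-in for utilization.model.model.Model, only so this sketch runs on its own."""


class EchoModel(Model):
    """A toy provider that 'generates' by echoing the prompt, to illustrate the three entry points."""

    model_backend = "echo"  # hypothetical backend name

    def call_model(self, batched_inputs: List[str]) -> List[Any]:
        # Replace with a real API or engine call; here each prompt is just wrapped in a dict.
        return [{"text": prompt.upper()} for prompt in batched_inputs]

    def to_text(self, result: Any) -> str:
        return result["text"]

    def generation(self, batched_inputs: List[str]) -> List[str]:
        return [self.to_text(r) for r in self.call_model(batched_inputs)]

    def get_ppl(self, batched_inputs: List[Tuple[str, str]]) -> List[Tuple[float, int]]:
        # Assumed contract: (context, continuation) pairs in, (log-likelihood, length) pairs out.
        return [(0.0, len(continuation)) for _, continuation in batched_inputs]

    def get_prob(self, batched_inputs: List[Tuple[str, int]]) -> List[List[float]]:
        # Assumed contract: (prompt, option count) pairs in, per-option scores out.
        return [[1.0 / num_options] * num_options for _, num_options in batched_inputs]


if __name__ == "__main__":
    model = EchoModel()
    print(model.generation(["hello world"]))   # ['HELLO WORLD']
    print(model.get_prob([("pick one", 4)]))   # [[0.25, 0.25, 0.25, 0.25]]
```

In a real provider, `call_model` would batch requests to the backend and the two scoring methods would return actual log-probabilities rather than placeholders.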
diff --git a/docs/utilization/how-to-load-datasets-from-huggingface.md b/docs/utilization/how-to-load-datasets-from-huggingface.md new file mode 100644 index 00000000..d379f74d --- /dev/null +++ b/docs/utilization/how-to-load-datasets-from-huggingface.md @@ -0,0 +1,76 @@ +# How to Load Datasets from Hugging Face + +In this tutorial, we will learn how to download datasets from Hugging Face using the [`datasets`](https://huggingface.co/docs/datasets/en/index) library. The `datasets` library is a powerful tool that allows you to easily download and work with datasets from [Hugging Face](https://huggingface.co/datasets). + +See a full list of supported datasets at [here](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/supported-datasets.md). + +## Case 1: Directly load from Hugging Face + +By default, `LLMBox` will handle everything for you. You just need to specify the dataset name in the command line. + +```python +python inference.py -m model -d mmlu +``` + +The dataset will be downloaded and cached in the `~/.cache/huggingface/datasets` directory. + +## Case 2: Load from a Hugging Face mirror + +Datasets + +To load a dataset from a Hugging Face mirror, you can use the `--hf_mirror` flag. The dataset will be downloaded from Hugging Face mirror using `hfd.sh`. + +This is an experimental feature and may not work in some environments. If you encounter any issues, please let us know. + +```shell +python inference.py -m model -d mmlu --hf_mirror +``` + +`hfd.sh` is a slightly modified version of `huggingface-cli` download [wrapper](https://gist.github.com/padeoe/697678ab8e528b85a2a7bddafea1fa4f/), which offers a more stable and faster download speed than the original `huggingface-cli`. + +`hfd.sh` will download the dataset from the Hugging Face mirror and cache it in the `~/.cache/huggingface/datasets` directory. Then `datasets` will load the dataset from the cache. + +The next time you run the command, `datasets` will directly load the dataset from the cache: + +```shell +python inference.py -m another-model -d mmlu +``` + +## Case 3: Load local dataset in offline mode + +If you have already downloaded the dataset and want to load it in offline mode, you can use `--dataset_path` to specify the dataset path. + +```shell +python inference.py -m model -d mmlu --dataset_path path/to/mmlu +``` + +The dataset will be loaded from the specified path. 
+ + +```bash +# from a cloned directory of the huggingface dataset repository: +python inference.py -d copa --dataset_path /path/to/copa + +# from a local (nested) directory saved by `dataset.save_to_disk`: +python inference.py -d race --dataset_path /path/to/race/middle +python inference.py -d race:middle --dataset_path /path/to/race +python inference.py -d race:middle --dataset_path /path/to/race/middle +python inference.py -d race:middle,high --dataset_path /path/to/race +``` + +`dataset_path` can also accept a dataset file or a directory containing these files (supports json, jsonl, csv, and txt): +```bash +# load one split from one subset only +python inference.py -d gsm8k --dataset_path /path/to/gsm.jsonl +python inference.py -d race --dataset_path /path/to/race/middle/train.json + +# load test and train splits from middle subset (a directory contains `/path/to/race/middle/train.json` and `/path/to/race/middle/test.json`) +python inference.py -d race --dataset_path /path/to/race/middle --evaluation_set "test[:10]" --example_set "train" + +# load test and train splits from middle and high subsets (a nested directory) +python inference.py -d race:middle,high --dataset_path /path/to/race --evaluation_set "test[:10]" --example_set "train" + +# load test and train splits from middle and high subsets with a filename pattern +python inference.py -d race:middle,high --evaluation_set "test[:10]" --example_set "train" --dataset_path "/pattern/of/race_{subset}_{split}.json" +python inference.py -d mmlu --evaluation_set val --example_set dev --dataset_path "/pattern/of/mmlu/{split}/{subset}_{split}.csv" +``` diff --git a/docs/utilization/how-to-load-datasets-with-subsets.md b/docs/utilization/how-to-load-datasets-with-subsets.md new file mode 100644 index 00000000..05dad67a --- /dev/null +++ b/docs/utilization/how-to-load-datasets-with-subsets.md @@ -0,0 +1,74 @@ +# How to Load Datasets with Subsets + +Some datasets have multiple subsets. For example, Massive Multitask Language Understanding (`mmlu`) dataset contains 57 different subsets categorized into four categories: `stem`, `social_sciences`, `humanities`, and `other`. + +While some other dataset is a subset of another dataset. For example, Choice Of Plausible Alternatives (`copa`) is a subset of `super_glue`. + +See a full list of supported datasets at [here](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/supported-datasets.md). + +## Load from huggingface server + +We use the `datasets` library to load the dataset from the huggingface server. If you have issue connecting to the Internet or the Hugging Face server, see [here](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-load-datasets-from-huggingface.md) for help. + +Load a dataset that is a subset of another dataset (e.g. `copa`): + +```shell +python inference.py -d copa +``` + +Load a dataset with multiple subsets (e.g. `mmlu`): + +```shell +python inference.py -d mmlu:abstract_algebra,human_sexuality +``` + +In some cases, you may want to load a specific split of the dataset (e.g. `test`, `dev`, `validation`, ...). 
Both `evaluation_set` and `example_set` support the Huggingface [String API](https://huggingface.co/docs/datasets/loading#slice-splits): + +```shell +python inference.py -d race:middle,high --evaluation_set "test[:10]" --example_set "train" +``` + +## Understand the behaviour of subsets + +By default we load all the subsets of a dataset: + +```shell +python inference.py -m model -d mmlu +# expands to all 57 subsets +# equivalent: mmlu:abstract_algebra,human_sexuality,human_sexuality,... +# equivalent: mmlu:[stem],[social_sciences],[humanities],[other] +``` + +```shell +python inference.py -m model -d arc +# equivalent: arc:ARC-Easy,ARC-Challenge +``` + +Unless a default subset is defined (see [supported datsaets](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/supported-datasets.md) for all the default subsets): + +```bash +python inference.py -m model -d cnn_dailymail +# equivalent: cnn_dailymail:3.0.0 +``` + +Some datasets like GPQA (Google-Proof Q&A) have to load example set separately. You need to download the dataset to any directory and provide the path to the dataset: + +```bash +# few_shot +python inference.py -m model -d gpqa --ranking_type generation -shots 5 --example_set "../gpqa/prompts" +``` + +## Overriding `load_raw_dataset` function + +Also feel free to override this function if you want to load the dataset in a different way: + +```python +from .utils import load_raw_dataset_from_file, get_raw_dataset_loader + +class MyDataset(Dataset): + def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_set): + self.evaluation_data = get_raw_dataset_loader(...)("test") + self.example_data = load_raw_dataset_from_file("examples.json") +``` + +For more details on how to customize the dataset, see this [guide](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-customize-dataset.md). diff --git a/docs/utilization/supported-datasets.md b/docs/utilization/supported-datasets.md new file mode 100644 index 00000000..c7f762d3 --- /dev/null +++ b/docs/utilization/supported-datasets.md @@ -0,0 +1,425 @@ +# Supported Datasets of LLMBox + +We currently support 56+ commonly used datasets for LLMs. Each dataset is either a multiple-choice dataset or a generation dataset. + +Some datasets have multiple subsets. For example, Massive Multitask Language Understanding (`mmlu`) dataset contains 57 different subsets categorized into four categories: `stem`, `social_sciences`, `humanities`, and `other`. + +While some other dataset is a subset of another dataset. For example, Choice Of Plausible Alternatives (`copa`) is a subset of `super_glue`. + +See how to [load datasets with subsets](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-load-datasets-with-subsets.md). + +Some datasets support Chain-of-Thought reasoning. For example, Grade School Math 8K (`gsm8k`) supports three types of CoT: `base`, `least_to_most`, and `pal`. + +You can find the supported datasets in the following table. 
+ +## Supported Datasets + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Dataset | Subsets / Collections | Evaluation Type | CoT | Notes |
|---|---|---|---|---|
| AGIEval (agieval, alias of agieval_single_choice and agieval_cot) | English: sat-en, sat-math, lsat-ar, lsat-lr, lsat-rc, logiqa-en, aqua-rat, sat-en-without-passage | MultipleChoice | | |
| | gaokao-chinese, gaokao-geography, gaokao-history, gaokao-biology, gaokao-chemistry, gaokao-english, logiqa-zh | | | |
| | jec-qa-kd, jec-qa-ca, math, gaokao-physics, gaokao-mathcloze, gaokao-mathqa | Generation | | |
| Alpaca Eval (alpaca_eval) | / | Generation | | Single GPTEval |
| Adversarial Natural Language Inference (anli) | Round2 (default) | MultipleChoice | | |
| AI2's Reasoning Challenge (arc) | ARC-Easy, ARC-Challenge | MultipleChoice | | Normalization |
| BIG-Bench Hard (bbh) | boolean_expressions, ... | Generation | | |
| Boolean Questions (boolq) | super_glue | MultipleChoice | | |
| CommitmentBank (cb) | super_glue | MultipleChoice | | |
| C-Eval (ceval) | stem: advanced_mathematics, college_chemistry, ... | MultipleChoice | | |
| | social science: business_administration, college_economics, ... | | | |
| | humanities: art_studies, chinese_language_and_literature, ... | | | |
| | other: accountant, basic_medicine, ... | | | |
| Massive Multitask Language Understanding in Chinese (cmmlu) | stem: anatomy, astronomy, ... | MultipleChoice | | |
| | social science: ancient_chinese, business_ethics, ... | | | |
| | humanities: arts, chinese_history, ... | | | |
| | other: agronomy, chinese_driving_rule, ... | | | |
| CNN Dailymail (cnn_dailymail) | 3.0.0 (default), ... | Generation | | |
| Reasoning About Colored Objects (color_objects) | bigbench (reasoning_about_colored_objects) | Generation | | |
| Commonsense QA (commonsenseqa) | / | MultipleChoice | | |
| Choice Of Plausible Alternatives (copa) | super_glue | MultipleChoice | | |
| Conversational Question Answering (coqa) | / | Generation | | Download: train, dev |
| CrowS-Pairs (crows_pairs) | / | MultipleChoice | | |
| Discrete Reasoning Over the content of Paragraphs (drop) | / | Generation | | |
| GAOKAO (gaokao) | Chinese: 2010-2022_Chinese_Modern_Lit, 2010-2022_Chinese_Lang_and_Usage_MCQs | Generation | | Metric: Exam scoring |
| | English: 2010-2022_English_Reading_Comp, 2010-2022_English_Fill_in_Blanks, ... | | | |
| | 2010-2022_Math_II_MCQs, 2010-2022_Math_I_MCQs, ... | | | |
| Google-Proof Q&A (GPQA) | gpqa_main (default), gpqa_extended, ... | MultipleChoice | | Dataset |
| Grade School Math 8K (gsm8k) | main (default), socratic | Generation | | Code exec |
| HaluEval (halueval) | dialogue_samples, qa_samples, summarization_samples | Generation | | |
| HellaSWAG (hellaswag) | / | MultipleChoice | | |
| HumanEval (humaneval) | / | Generation | | Pass@K |
| Instruction-Following Evaluation (ifeval) | / | Generation | | |
| LAnguage Modeling Broadened to Account for Discourse Aspects (lambada) | default (default), de, ... (source: EleutherAI/lambada_openai) | Generation | | |
| Mathematics Aptitude Test of Heuristics (math) | / | Generation | | |
| Mostly Basic Python Problems (mbpp) | full (default), sanitized | Generation | | Pass@K |
| Massive Multitask Language Understanding (mmlu) | stem: abstract_algebra, astronomy, ... | MultipleChoice | | |
| | social_sciences: econometrics, high_school_geography, ... | | | |
| | humanities: formal_logic, high_school_european_history, ... | | | |
| | other: anatomy, business_ethics, ... | | | |
| Multi-turn Benchmark (mt_bench) | / | Generation | | Multi-turn GPTEval |
| Natural Questions (nq) | / | Generation | | |
| OpenBookQA (openbookqa) | main (default), additional | MultipleChoice | | Normalization |
| Penguins In A Table (penguins_in_a_table) | bigbench | MultipleChoice | | |
| Physical Interaction: Question Answering (piqa) | / | MultipleChoice | | |
| Question Answering in Context (quac) | / | Generation | | |
| ReAding Comprehension (race) | high, middle | MultipleChoice | | Normalization |
| Real Toxicity Prompts (real_toxicity_prompts) | / | Generation | | Perspective Toxicity |
| Recognizing Textual Entailment (rte) | super_glue | MultipleChoice | | |
| Social Interaction QA (siqa) | / | MultipleChoice | | |
| Stanford Question Answering Dataset (squad, squad_v2) | / | Generation | | |
| Story Cloze Test (story_cloze) | 2016 (default), 2018 | MultipleChoice | | Manually download |
| TL;DR (tldr) | / | Generation | | |
| TriviaQA (triviaqa) | rc.wikipedia.nocontext (default), rc, rc.nocontext, ... | Generation | | |
| TruthfulQA (truthfulqa_mc) | multiple_choice (default), generation (not supported) | MultipleChoice | | |
| Vicuna Bench (vicuna_bench) | / | Generation | | GPTEval |
| WebQuestions (webq) | / | Generation | | |
| Words in Context (wic) | super_glue | MultipleChoice | | |
| Winogender Schemas (winogender) | main, gotcha | MultipleChoice | | Group by gender |
| WSC273 (winograd) | wsc273 (default), wsc285 | MultipleChoice | | |
| WinoGrande (winogrande) | winogrande_debiased (default), ... | MultipleChoice | | |
| Conference on Machine Translation (wmt21, wmt19, ...) | en-ro, ro-en, ... | Generation | | |
| Winograd Schema Challenge (wsc) | super_glue | MultipleChoice | | |
| Extreme Summarization (xsum) | / | Generation | | |
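Any dataset in the table can be passed to `inference.py` by the identifier shown in parentheses, and subsets use the `dataset:subset1,subset2` syntax covered in the subset-loading guide above. A brief illustration (the model name is only a placeholder):

```bash
# evaluate one model on two MMLU subsets and the full ARC collection
python inference.py -m llama-2-7b-hf -d mmlu:abstract_algebra,human_sexuality arc
```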
diff --git a/dry_test.sh b/dry_test.sh deleted file mode 100644 index 2fe302eb..00000000 --- a/dry_test.sh +++ /dev/null @@ -1,130 +0,0 @@ - -datasets=("agieval" "alpaca_eval" "anli" "arc" "bbh" "boolq" "cb" "ceval" "cmmlu" "cnn_dailymail" "color_objects" "commonsenseqa" "copa" "coqa" "crows_pairs" "drop" "gaokao" "gsm8k" "halueval" "hellaswag" "humaneval" "ifeval" "lambada" "math" "mbpp" "mmlu" "mt_bench" "nq" "openbookqa" "penguins_in_a_table" "piqa" "quac" "race" "real_toxicity_prompts" "rte" "siqa" "squad" "squad_v2" "story_cloze" "tldr" "triviaqa" "truthfulqa_mc" "vicuna_bench" "webq" "wic" "winogender" "winograd" "winogrande" "wmt16:de-en" "wsc" "xsum") - - - -function dry_test() { - echo "Running dry test on all datasets" - for dataset in "${datasets[@]}" - do - if [[ "$dataset" = "mbpp" || "$dataset" = "humaneval" ]]; then - args="--pass_at_k 1" - elif [[ "$dataset" = "coqa" ]]; then - if [[ -n "$COQA_PATH" ]]; then - args="--dataset_path $COQA_PATH" - else - echo "Skipping $dataset - COQA_PATH not set" - continue - fi - elif [[ "$dataset" = "story_cloze" ]]; then - if [[ -n "$STORY_CLOZE_PATH" ]]; then - args="--dataset_path $STORY_CLOZE_PATH" - else - echo "Skipping $dataset - STORY_CLOZE_PATH not set" - continue - fi - elif [[ "$dataset" = "real_toxicity_prompts" ]]; then - if [[ -n "$PERSPECTIVE_API_KEY" ]]; then - args="--perspective_api_key $PERSPECTIVE_API_KEY" - else - echo "Skipping $dataset - PERSPECTIVE_API_KEY not set" - continue - fi - elif [[ "$dataset" = "alpaca_eval" || "$dataset" = "mt_bench" || "$dataset" = "vicuna_bench" ]]; then - if [[ -n "$OPENAI_API_KEY" ]]; then - args="--openai_api_key $OPENAI_API_KEY" - else - echo "Skipping $dataset - OPENAI_API_KEY not set" - continue - fi - else - args="" - fi - - echo "Running on $dataset" - python inference.py -m gpt2 -i 10 -d "$dataset" --dry_run True $args > /dev/null - if [ $? 
-ne 0 ]; then - echo " ❎" - else - echo " ✅" - fi - done -} - - -function prefix_caching_test() { - green="\033[32m" - blue="\033[34m" - reset="\033[0m" - echo "Running prefix_caching test on all datasets" - for dataset in "${datasets[@]}" - do - if [[ "$dataset" = "mbpp" || "$dataset" = "humaneval" ]]; then - args="--pass_at_k 1" - elif [[ "$dataset" = "coqa" ]]; then - if [[ -n "$COQA_PATH" ]]; then - args="--dataset_path $COQA_PATH" - else - echo "Skipping $dataset - COQA_PATH not set" - continue - fi - elif [[ "$dataset" = "story_cloze" ]]; then - if [[ -n "$STORY_CLOZE_PATH" ]]; then - args="--dataset_path $STORY_CLOZE_PATH" - else - echo "Skipping $dataset - STORY_CLOZE_PATH not set" - continue - fi - elif [[ "$dataset" = "real_toxicity_prompts" ]]; then - if [[ -n "$PERSPECTIVE_API_KEY" ]]; then - args="--perspective_api_key $PERSPECTIVE_API_KEY" - else - echo "Skipping $dataset - PERSPECTIVE_API_KEY not set" - continue - fi - elif [[ "$dataset" = "alpaca_eval" || "$dataset" = "mt_bench" || "$dataset" = "vicuna_bench" ]]; then - if [[ -n "$OPENAI_API_KEY" ]]; then - args="--openai_api_key $OPENAI_API_KEY" - else - echo "Skipping $dataset - OPENAI_API_KEY not set" - continue - fi - else - args="" - fi - - echo -e "${green}Running on $dataset (--prefix_caching True)${reset}" - python inference.py -m /home/tangtianyi/Llama-2-7b-hf -d "$dataset" --max_evaluation_instances 50 -shots 5 --model_type instruction -b 20:auto $args | tail -n 2 | head -n 1 | echo -e "${blue}>>> $dataset (--prefix_caching True): $(cat)${reset}" - - echo -e "${green}Running on $dataset (--prefix_caching False)${reset}" - python inference.py -m /home/tangtianyi/Llama-2-7b-hf -d "$dataset" --prefix_caching False --max_evaluation_instances 50 -shots 5 --model_type instruction -b 20:auto $args | tail -n 2 | head -n 1 | echo -e "${blue}>>> $dataset (--prefix_caching False): $(cat)${reset}" - done -} - - - -if [[ -z $1 ]]; then - echo "Usage: dry_test.sh " - echo "Commands:" - echo " all: Run test on all datasets" - echo " dry_test: Run dry test on all datasets" - echo " prefix_caching: Run prefix caching test on all datasets" - exit 1 -fi - -if [[ -z $CUDA_VISIBLE_DEVICES ]]; then - CUDA_VISIBLE_DEVICES=0 -fi - -if [[ $1 = "all" ]]; then - dry_test - prefix_caching_test -elif [[ $1 = "dry_test" ]]; then - dry_test -elif [[ $1 = "prefix_caching" ]]; then - prefix_caching_test -else - echo "Invalid command" - exit 1 -fi - diff --git a/inference.py b/inference.py index 757a48b9..1724b092 100644 --- a/inference.py +++ b/inference.py @@ -1,11 +1,11 @@ -from utilization import Evaluator, parse_argument +from utilization import get_evaluator, parse_argument def main(): r"""The main pipeline for argument parsing, initialization, and evaluation.""" model_args, dataset_args, evaluation_args = parse_argument(initalize=True) - evaluator = Evaluator( + evaluator = get_evaluator( model_args=model_args, dataset_args=dataset_args, evaluation_args=evaluation_args, diff --git a/tests/dry_test/fixtures.py b/tests/dry_test/fixtures.py index bfda4a89..1bd1eea2 100644 --- a/tests/dry_test/fixtures.py +++ b/tests/dry_test/fixtures.py @@ -1,25 +1,62 @@ -from typing import List +import os +from pathlib import Path +from typing import Dict, List, Optional import pytest +import requests -from utilization import Evaluator, parse_argument + +@pytest.fixture(autouse=True) +def run_before_and_after_tests(): + """Fixture to execute asserts before and after a test is run""" + # Setup: fill with any logic you want + + clear = 
os.environ.get("GITHUB_ACTION", None) == "1" + path = Path.home() / ".cache/huggingface/datasets" + + if clear: + path.mkdir() + + yield # this is where the testing happens + + if clear: + path.rmdir() @pytest.fixture def run_evaluate(): - def evaluate(args: List[str]): + def evaluate(args: List[str], cuda: str = "", test_evaluation_data: Optional[Dict[str, str]] = None): + if cuda: + if isinstance(cuda, int): + cuda = str(cuda) + os.environ["CUDA_VISIBLE_DEVICES"] = cuda + + from datasets.exceptions import DatasetGenerationError + + from utilization import get_evaluator, parse_argument + model_args, dataset_args, evaluation_args = parse_argument( args=args, initalize=True, ) - evaluator = Evaluator( - model_args=model_args, - dataset_args=dataset_args, - evaluation_args=evaluation_args, - initalize=False, - ) - return evaluator.evaluate() + try: + evaluator = get_evaluator( + model_args=model_args, + dataset_args=dataset_args, + evaluation_args=evaluation_args, + initalize=False, + ) + evaluator.evaluate() + except (ConnectionError, requests.exceptions.ReadTimeout): + pytest.skip(reason="ConnectionError") + except DatasetGenerationError: + pytest.skip(reason="DatasetGenerationError") + + if test_evaluation_data is not None: + for key, value in test_evaluation_data.items(): + if key in evaluator.dataset._datasets: + assert evaluator.dataset._datasets[key].evaluation_data[0] == value return evaluate diff --git a/tests/dry_test/test_datasets.py b/tests/dry_test/test_datasets.py index a65bfa80..04b5c30e 100644 --- a/tests/dry_test/test_datasets.py +++ b/tests/dry_test/test_datasets.py @@ -1,37 +1,40 @@ +import nltk import pytest from .fixtures import run_evaluate +nltk.download('punkt') + datasets = { "agieval": [], - "alpaca_eval": None, + "alpaca_eval": "skip", "anli": [], "arc": [], "bbh": [], "boolq": [], "cb": [], - "ceval": [], - "cmmlu": [], + "ceval": ["--no_dataset_threading"], # dataset threading has issues with pytest + "cmmlu": ["--no_dataset_threading"], "cnn_dailymail": [], "color_objects": [], "commonsenseqa": [], "copa": [], - "coqa": None, - "crows_pairs": None, + "coqa": "skip", + # "crows_pairs": "does not support api model", "drop": [], "gaokao": [], "gsm8k": [], - "gpqa": [], + "gpqa": "requires authentication", "halueval": [], "hellaswag": [], "humaneval": ["--pass_at_k", "1"], "ifeval": [], "lambada": [], - "math": [], + "math": ["--no_dataset_threading"], "mbpp": ["--pass_at_k", "1"], "mmlu": [], "mrpc": [], - "mt_bench": None, + "mt_bench": "skip", "nq": [], "openbookqa": [], "penguins_in_a_table": [], @@ -39,40 +42,75 @@ "qnli": [], "quac": [], "race": [], - "real_toxicity_prompts": None, + "real_toxicity_prompts": "skip", "rte": [], "siqa": [], "sst2": [], "squad": [], "squad_v2": [], - "story_cloze": None, + "story_cloze": "skip", "tldr": [], "triviaqa": [], "truthfulqa_mc": [], "tydiqa": [], - "vicuna_bench": None, + "vicuna_bench": "skip", "webq": [], "wic": [], "winogender": [], - "winograd": [], + # "winograd": "does not support api model", "winogrande": [], "wmt16:de-en": [], "wsc": [], "xcopa": [], - "xlsum": [], - "xsum": [], + "xlsum": "dataset too large", + "xsum": ["--no_dataset_threading"], +} + +test_evaluation_data = { + "agieval:aqua-rat": ( + 'Q: A car is being driven, in a straight line and at a uniform speed, towards the base of a ' + 'vertical tower. The top of the tower is observed from the car and, in the process, it takes 10 ' + 'minutes for the angle of elevation to change from 45° to 60°. 
After how much more time will this ' + 'car reach the base of the tower?\n' + 'None\n' + 'Answer: Among A through E, the answer is 5(√3 + 1)' + ), + "agieval:gaokao-mathcloze": ( + '问题:已知 $a \\in \\mathrm{R}$, 函数 $f(x)=\\left\\{\\begin{array}{l}x^{2}-4, x>2 \\\\ |x-3|+a, x ' + '\\leq 2,\\end{array}\\right.$ 若 $f[f(\\sqrt{6})]=3$, 则 $a=(\\quad)$\n' + '答案:' + ) } @pytest.mark.parametrize("dataset, extra_args", datasets.items()) def test_datasets_dry_run(run_evaluate, dataset, extra_args): - if extra_args is None: + """You may re-run one of these tests (e.g. ceval) with: + + `pytest tests/dry_test/test_datasets.py::test_datasets_dry_run[ceval-extra_args7]` + """ + if not isinstance(extra_args, list): return + run_evaluate( - ["-m", "gpt-3.5-turbo", "-d", dataset, "-b", "10", "--dry_run", "--cuda", "0", "--openai_api_key", "fake_key"] + - extra_args + ["-m", "gpt-3.5-turbo", "-d", dataset, "-b", "10", "--dry_run", "--openai_api_key", "fake_key", "-i", "5"] + + extra_args, + cuda=0, + test_evaluation_data=test_evaluation_data, + ) + + +def test_winograd_dry_run(run_evaluate): + run_evaluate( + ["-m", "gpt2", "-d", "winograd", "-b", "10", "--dry_run", "-i", "5"], + cuda=0, + test_evaluation_data=test_evaluation_data, ) def test_crows_pairs_dry_run(run_evaluate): - run_evaluate(["-m", "gpt2", "-d", "crows_pairs", "-b", "10", "--dry_run", "--cuda", "0"]) + run_evaluate( + ["-m", "gpt2", "-d", "crows_pairs", "-b", "10", "--dry_run", "-i", "5"], + cuda=0, + test_evaluation_data=test_evaluation_data, + ) diff --git a/tests/dry_test/test_models.py b/tests/dry_test/test_models.py index b4905bb5..7f748e27 100644 --- a/tests/dry_test/test_models.py +++ b/tests/dry_test/test_models.py @@ -1,4 +1,5 @@ import pytest +import torch from .fixtures import run_evaluate @@ -12,9 +13,16 @@ } +# 3 (datasets) by 6 (models) grid @pytest.mark.parametrize("dataset", ["gsm8k", "hellaswag", "mmlu"]) @pytest.mark.parametrize("model, extra_args", models.items()) def test_models_dry_run(run_evaluate, model, dataset, extra_args): + if not torch.cuda.is_available() and extra_args[-2:] == ["--cuda", "0"]: + pytest.skip("CUDA is not available") + if extra_args is None: return - run_evaluate(["-m", model, "-d", dataset, "-b", "10", "--dry_run"] + extra_args) + try: + run_evaluate(["-m", model, "-d", dataset, "-b", "10", "--dry_run"] + extra_args, cuda=0) + except torch.cuda.OutOfMemoryError: + pytest.skip(f"Out of memory error on {model} {dataset}") diff --git a/tests/requirements-tests.txt b/tests/requirements-tests.txt index 51553069..18438e04 100644 --- a/tests/requirements-tests.txt +++ b/tests/requirements-tests.txt @@ -2,6 +2,10 @@ pytest pytest-md pytest-emoji +pytest-cov +pytest-github-actions-annotate-failures +pytest-split +pytest-rerunfailures packaging setuptools wheel @@ -14,6 +18,8 @@ datasets>=2.16.1 coloredlogs tqdm>=4.58.0 jinja2 +accelerate +# install vllm in pytest-check.yml with --no-build-isolation # API Models anthropic diff --git a/tests/utilization/utils/test_batch_sampler.py b/tests/utilization/utils/test_batch_sampler.py index a5ef4d45..70309330 100644 --- a/tests/utilization/utils/test_batch_sampler.py +++ b/tests/utilization/utils/test_batch_sampler.py @@ -4,8 +4,8 @@ sys.path.append('.') from utilization.model.huggingface_model import HuggingFaceModel +from utilization.model.model_utils.batch_sampler import AutoBatchSizeSampler, DatasetCollectionBatchSampler from utilization.utils.arguments import ModelArguments -from utilization.utils.batch_sampler import AutoBatchSizeSampler, 
DatasetCollectionBatchSampler def test_auto_batch_sampler_auto_batching(): diff --git a/training/evol_instruct.py b/training/evol_instruct.py index 68447a15..88ac1f6b 100644 --- a/training/evol_instruct.py +++ b/training/evol_instruct.py @@ -5,7 +5,7 @@ from tqdm import tqdm -from utilization.model.openai import Openai +from utilization.model.openai_model import Openai from utilization.utils import ModelArguments base_instruction_breath = "I want you act as a Prompt Creator.\r\n\ diff --git a/utilization/README.md b/utilization/README.md index 23ff2a78..1ee1329a 100644 --- a/utilization/README.md +++ b/utilization/README.md @@ -3,6 +3,8 @@ # Utilization - [Utilization](#utilization) + - [Supported Datasets](#supported-datasets) + - [Customize Dataset](#customize-dataset) - [Usage](#usage) - [Model Arguments](#model-arguments) - [Dataset Arguments](#dataset-arguments) @@ -12,10 +14,20 @@ - [Customizing HuggingFace Models](#customizing-huggingface-models) - [Adding a New Model Provider](#adding-a-new-model-provider) - [Customize Chat Template](#customize-chat-template) - - [Supported Datasets](#supported-datasets) - - [Customize Dataset](#customize-dataset) - [Change Log](#change-log) + +## Supported Datasets + +- See a full list of supported datasets at [here](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/supported-datasets.md). +- See how to [load datasets with subsets](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-load-datasets-with-subsets.md). +- See how to [load datasets](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-load-datasets-from-huggingface.md) from Hugging Face or its mirror. + +## Customize Dataset + +See [this guide](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-customize-dataset.md) for details. + + ## Usage Evaluating davinci-002 on HellaSwag, with prefix caching and flash attention enabled by default: @@ -41,7 +53,7 @@ python inference.py -m microsoft/phi-2 -d gsm8k -shots 8 --sample_num 100 --load Evaluating LLaMA-2 (7b) on CMMLU and CEval with instruction using vllm: ```bash -CUDA_VISIBLE_DEVICES=0 python inference.py -m llama-2-7b-hf -d cmmlu ceval --vllm True --model_type instruction +CUDA_VISIBLE_DEVICES=0 python inference.py -m llama-2-7b-hf -d cmmlu ceval --vllm True --model_type chat ``` We use all cuda devices by default. You can specify the device with `CUDA_VISIBLE_DEVICES`. @@ -50,7 +62,7 @@ We use all cuda devices by default. You can specify the device with `CUDA_VISIBL Define the model parameters, efficient evaluation settings, generation arguments, quantization, and additional configuration options. -We provide an enumeration ([`enum`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/model/enum.py)) for models corresponding to each `model_backend`. If a model is not listed within this enumeration, `--model_backend` should be specified directly. +We provide an enumeration ([`model_enum`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/model_enum.py)) for models corresponding to each `model_backend`. If a model is not listed within this enumeration, `--model_backend` should be specified directly. ```text @@ -411,497 +423,9 @@ python inference.py -m gpt-3.5-turbo -d gsm8k --model_type chat -shots 8 -sys "Y You can customize the [chat template](https://github.com/RUCAIBox/LLMBox/blob/main/utilization/chat_templates.py) for local chat-based models. We provide a set of chat templates for different models. 
You can specify a jinja2 chat template with the `--chat_template` argument. It works in the same way as the [tokenizers](https://huggingface.co/docs/transformers/main/en/chat_templating).

-## Supported Datasets
-
-We currently support 53 commonly used datasets for LLMs. Each dataset may includes multiple subsets, or is a subset of a collection.
-
-Load from huggingface server:
-```bash
-python inference.py -d copa
-python inference.py -d race:middle,high
-python inference.py -d race:middle,high --evaluation_set "test[:10]" --example_set "train"
-```
-
| Dataset | Subsets / Collections | Evaluation Type | CoT | Notes |
|---|---|---|---|---|
| AGIEval (agieval, alias of agieval_single_choice and agieval_cot) | English: sat-en, sat-math, lsat-ar, lsat-lr, lsat-rc, logiqa-en, aqua-rat, sat-en-without-passage | MultipleChoice | | |
| | gaokao-chinese, gaokao-geography, gaokao-history, gaokao-biology, gaokao-chemistry, gaokao-english, logiqa-zh | | | |
| | jec-qa-kd, jec-qa-ca, math, gaokao-physics, gaokao-mathcloze, gaokao-mathqa | Generation | | |
| Alpaca Eval (alpaca_eval) | / | Generation | | Single GPTEval |
| Adversarial Natural Language Inference (anli) | Round2 (default) | MultipleChoice | | |
| AI2's Reasoning Challenge (arc) | ARC-Easy, ARC-Challenge | MultipleChoice | | Normalization |
| BIG-Bench Hard (bbh) | boolean_expressions, ... | Generation | | |
| Boolean Questions (boolq) | super_glue | MultipleChoice | | |
| CommitmentBank (cb) | super_glue | MultipleChoice | | |
| C-Eval (ceval) | stem: advanced_mathematics, college_chemistry, ... | MultipleChoice | | |
| | social science: business_administration, college_economics, ... | | | |
| | humanities: art_studies, chinese_language_and_literature, ... | | | |
| | other: accountant, basic_medicine, ... | | | |
| Massive Multitask Language Understanding in Chinese (cmmlu) | stem: anatomy, astronomy, ... | MultipleChoice | | |
| | social science: ancient_chinese, business_ethics, ... | | | |
| | humanities: arts, chinese_history, ... | | | |
| | other: agronomy, chinese_driving_rule, ... | | | |
| CNN Dailymail (cnn_dailymail) | 3.0.0 (default), ... | Generation | | |
| Reasoning About Colored Objects (color_objects) | bigbench (reasoning_about_colored_objects) | Generation | | |
| Commonsense QA (commonsenseqa) | / | MultipleChoice | | |
| Choice Of Plausible Alternatives (copa) | super_glue | MultipleChoice | | |
| Conversational Question Answering (coqa) | / | Generation | | Download: train, dev |
| CrowS-Pairs (crows_pairs) | / | MultipleChoice | | |
| Discrete Reasoning Over the content of Paragraphs (drop) | / | Generation | | |
| GAOKAO (gaokao) | Chinese: 2010-2022_Chinese_Modern_Lit, 2010-2022_Chinese_Lang_and_Usage_MCQs | Generation | | Metric: Exam scoring |
| | English: 2010-2022_English_Reading_Comp, 2010-2022_English_Fill_in_Blanks, ... | | | |
| | 2010-2022_Math_II_MCQs, 2010-2022_Math_I_MCQs, ... | | | |
| Google-Proof Q&A (GPQA) | gpqa_main (default), gpqa_extended, ... | MultipleChoice | | Dataset |
| Grade School Math 8K (gsm8k) | main (default), socratic | Generation | | Code exec |
| HaluEval (halueval) | dialogue_samples, qa_samples, summarization_samples | Generation | | |
| HellaSWAG (hellaswag) | / | MultipleChoice | | |
| HumanEval (humaneval) | / | Generation | | Pass@K |
| Instruction-Following Evaluation (ifeval) | / | Generation | | |
| LAnguage Modeling Broadened to Account for Discourse Aspects (lambada) | default (default), de, ... (source: EleutherAI/lambada_openai) | Generation | | |
| Mathematics Aptitude Test of Heuristics (math) | / | Generation | | |
| Mostly Basic Python Problems (mbpp) | full (default), sanitized | Generation | | Pass@K |
| Massive Multitask Language Understanding (mmlu) | stem: abstract_algebra, astronomy, ... | MultipleChoice | | |
| | social_sciences: econometrics, high_school_geography, ... | | | |
| | humanities: formal_logic, high_school_european_history, ... | | | |
| | other: anatomy, business_ethics, ... | | | |
| Multi-turn Benchmark (mt_bench) | / | Generation | | Multi-turn GPTEval |
| Natural Questions (nq) | / | Generation | | |
| OpenBookQA (openbookqa) | main (default), additional | MultipleChoice | | Normalization |
| Penguins In A Table (penguins_in_a_table) | bigbench | MultipleChoice | | |
| Physical Interaction: Question Answering (piqa) | / | MultipleChoice | | |
| Question Answering in Context (quac) | / | Generation | | |
| ReAding Comprehension (race) | high, middle | MultipleChoice | | Normalization |
| Real Toxicity Prompts (real_toxicity_prompts) | / | Generation | | Perspective Toxicity |
| Recognizing Textual Entailment (rte) | super_glue | MultipleChoice | | |
| Social Interaction QA (siqa) | / | MultipleChoice | | |
| Stanford Question Answering Dataset (squad, squad_v2) | / | Generation | | |
| Story Cloze Test (story_cloze) | 2016 (default), 2018 | MultipleChoice | | Manually download |
| TL;DR (tldr) | / | Generation | | |
| TriviaQA (triviaqa) | rc.wikipedia.nocontext (default), rc, rc.nocontext, ... | Generation | | |
| TruthfulQA (truthfulqa_mc) | multiple_choice (default), generation (not supported) | MultipleChoice | | |
| Vicuna Bench (vicuna_bench) | / | Generation | | GPTEval |
| WebQuestions (webq) | / | Generation | | |
| Words in Context (wic) | super_glue | MultipleChoice | | |
| Winogender Schemas (winogender) | main, gotcha | MultipleChoice | | Group by gender |
| WSC273 (winograd) | wsc273 (default), wsc285 | MultipleChoice | | |
| WinoGrande (winogrande) | winogrande_debiased (default), ... | MultipleChoice | | |
| Conference on Machine Translation (wmt21, wmt19, ...) | en-ro, ro-en, ... | Generation | | |
| Winograd Schema Challenge (wsc) | super_glue | MultipleChoice | | |
| Extreme Summarization (xsum) | / | Generation | | |
- -By default we load all the subsets of a dataset: - -```bash -python inference.py -m model -d arc -# equivalent: arc:ARC-Easy,ARC-Challenge -``` - -Unless a default subset is defined: - -```bash -python inference.py -m model -d cnn_dailymail -# equivalent: cnn_dailymail:3.0.0 -``` - -Some datasets like GPQA (Google-Proof Q&A) have to load example set separately: - -```bash -# few_shot -python inference.py -m model -d gpqa --ranking_type generation -shots 5 --example_set "../gpqa/prompts" -``` - -If `dataset_path` is not None, the dataset will be loaded from the given local path: - -```bash -# from a cloned directory of the huggingface dataset repository: -python inference.py -d copa --dataset_path /path/to/copa - -# from a local (nested) directory saved by `dataset.save_to_disk`: -python inference.py -d race --dataset_path /path/to/race/middle -python inference.py -d race:middle --dataset_path /path/to/race -python inference.py -d race:middle --dataset_path /path/to/race/middle -python inference.py -d race:middle,high --dataset_path /path/to/race -``` - -`dataset_path` can also accept a dataset file or a directory containing these files (supports json, jsonl, csv, and txt): -```bash -# load one split from one subset only -python inference.py -d gsm8k --dataset_path /path/to/gsm.jsonl -python inference.py -d race --dataset_path /path/to/race/middle/train.json - -# load test and train splits from middle subset (a directory contains `/path/to/race/middle/train.json` and `/path/to/race/middle/test.json`) -python inference.py -d race --dataset_path /path/to/race/middle --evaluation_set "test[:10]" --example_set "train" - -# load test and train splits from middle and high subsets (a nested directory) -python inference.py -d race:middle,high --dataset_path /path/to/race --evaluation_set "test[:10]" --example_set "train" - -# load test and train splits from middle and high subsets with a filename pattern -python inference.py -d race:middle,high --evaluation_set "test[:10]" --example_set "train" --dataset_path "/pattern/of/race_{subset}_{split}.json" -python inference.py -d mmlu --evaluation_set val --example_set dev --dataset_path "/pattern/of/mmlu/{split}/{subset}_{split}.csv" -``` - ---- - -Also feel free to override this function if you want to load the dataset in a different way: - -```python -from .utils import load_raw_dataset_from_file, get_raw_dataset_loader - -class MyDataset(Dataset): - def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_set): - self.evaluation_data = get_raw_dataset_loader(...)("test") - self.example_data = load_raw_dataset_from_file("examples.json") -``` - -## Customize Dataset - -See [`Customize Dataset`](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/customize-dataset.md) for details. - ## Change Log +- **June 6, 2024**: Refactor the codebase and add support for hf-mirror. - **May 24, 2024**: Chat format support including conversational few-shot and system prompts. - **May 10, 2024**: New instruction formatting using f-string and jinja2. - **May 7, 2024**: Bump openai and vllm version. 
diff --git a/utilization/__init__.py b/utilization/__init__.py index 32bc81ed..d3fe9a57 100644 --- a/utilization/__init__.py +++ b/utilization/__init__.py @@ -2,8 +2,51 @@ # Disable download counts for transformers to accelerate os.environ["HF_UPDATE_DOWNLOAD_COUNTS"] = "FALSE" +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional -from .evaluator import Evaluator +# this file only initializes .utils modules to avoid early import of torch from .utils import DatasetArguments, EvaluationArguments, ModelArguments, parse_argument -__all__ = ["Evaluator", "parse_argument", "ModelArguments", "DatasetArguments", "EvaluationArguments"] +if TYPE_CHECKING: + from .evaluator import Evaluator + + +def get_evaluator( + *, + model_args: "ModelArguments", + dataset_args: "DatasetArguments", + evaluation_args: Optional["EvaluationArguments"] = None, + initalize: bool = True, + load_hf_model: Optional[Callable] = None, + evaluation_data: Optional[List[Dict[str, Any]]] = None, + example_data: Optional[List[Dict[str, Any]]] = None, +) -> "Evaluator": + from .evaluator import Evaluator + + return Evaluator( + model_args=model_args, + dataset_args=dataset_args, + evaluation_args=evaluation_args, + initalize=initalize, + load_hf_model=load_hf_model, + evaluation_data=evaluation_data, + example_data=example_data, + ) + + +def register_dataset(name: str): + """Decorator to register a dataset class to the dataset registry.""" + + from .load_dataset import REGISTERY, _validate_dataset_class + + def _register_dataset_class(cls): + assert _validate_dataset_class(cls), f"{cls} is not a valid dataset class." + REGISTERY[name] = cls + return cls + + return _register_dataset_class + + +__all__ = [ + "get_evaluator", "parse_argument", "ModelArguments", "DatasetArguments", "EvaluationArguments", "register_dataset" +] diff --git a/utilization/chat_templates.py b/utilization/chat_templates.py index 3a1f3cb6..30ade8a1 100644 --- a/utilization/chat_templates.py +++ b/utilization/chat_templates.py @@ -1,22 +1,42 @@ -from typing import Any, Dict, Union +from typing import Any, Dict, List, Union __all__ = ["DEFAULT_CHAT_TEMPLATE", "DEFAULT_CHAT_CONFIGS"] + +def smart_space(parts: List[str], auto_leading_space) -> str: + + def add_space(msg: str, auto_leading_space: bool, context: str) -> str: + if auto_leading_space and msg and context and not context[-1].isspace() and not msg[0].isspace(): + return ' ' + msg + return msg + + rendered = "" + for part in parts: + if part: + rendered += add_space(part, auto_leading_space, rendered) + return rendered + + # sources: https://github.com/huggingface/chat-ui/blob/main/PROMPTS.md DEFAULT_CHAT_TEMPLATE = ( - "{% macro add(role, msg) -%}" - "{{ seq[role + '_start'] }}" - "{{ msg | smart_space(auto_leading_space, seq[role + '_start']) }}" - "{{ seq[role + '_end'] }}" - "{%- endmacro %}" - "{{ seq.get('all_start', '') }}" - "{% for message in messages %}" - "{{ add(message['role'], message['content']) }}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ seq['assistant_start'] }}" - "{% endif %}" + "{%- set data = namespace(parts=[]) -%}" + "" + "{%- if 'all_start' in seq -%}" + "{%- set data.parts = data.parts + [seq['all_start']] -%}" + "{%- endif -%}" + "" + "{%- for message in messages -%}" + "{%- set data.parts = data.parts + [seq[message['role'] + '_start']] -%}" + "{%- set data.parts = data.parts + [message['content']] -%}" + "{%- set data.parts = data.parts + [seq[message['role'] + '_end']] -%}" + "{%- endfor -%}" + "" + "{%- if add_generation_prompt -%}" + 
"{%- set data.parts = data.parts + [seq['assistant_start']] -%}" + "{%- endif -%}" + "" + "{{ data.parts | smart_space(auto_leading_space) }}" ) # Chat configs format: @@ -48,7 +68,8 @@ "assistant_start": "", "assistant_end": "\n\n", "auto_leading_space": True, - "default_stops": ["\n"], + "final_rstrip": True, + "default_stops": [], }, "llama2": { "all_start": "[INST] ", @@ -59,7 +80,7 @@ "assistant_start": "", "assistant_end": " [INST] ", "auto_leading_space": True, - "default_stops": [""], + "default_stops": [], }, "chatml": { "system_start": "<|im_start|>system\n", diff --git a/utilization/dataset/__init__.py b/utilization/dataset/__init__.py index 25b2b356..ec2782c1 100644 --- a/utilization/dataset/__init__.py +++ b/utilization/dataset/__init__.py @@ -1 +1,3 @@ -from .load import load_datasets +from .dataset import Dataset, DatasetCollection +from .generation_dataset import GenerationDataset +from .multiple_choice_dataset import MultipleChoiceDataset diff --git a/utilization/dataset/agieval_cot.py b/utilization/dataset/agieval_cot.py index 3bdea308..6715a52d 100644 --- a/utilization/dataset/agieval_cot.py +++ b/utilization/dataset/agieval_cot.py @@ -2,12 +2,12 @@ from functools import cached_property from logging import getLogger -from ..metric import Em -from ..utils import math_equiv -from .dataset_enum import ( +from ..dataset_enum import ( AGIEVAL_EN_CLOZE_TASKS, AGIEVAL_EN_PROMPT_TASKS, AGIEVAL_EN_QA_TASKS, AGIEVAL_MULTI_ANSWERS_TASKS, AGIEVAL_NO_LETTER_CHOICE_TASKS, AGIEVAL_ZH_CLOZE_TASKS, AGIEVAL_ZH_PROMPT_TASKS, AGIEVAL_ZH_QA_TASKS ) +from ..metric import Em +from ..utils import math_equiv from .generation_dataset import GenerationDataset logger = getLogger(__name__) @@ -15,25 +15,28 @@ # `options_text` does not follow the standard MultipleChoiceDataset format, # because there might be multiple correct answers in the AGIEval dataset. INSTRUCTIONS = { - "mcq_zh_nocot_zero_shot": "{passage}问题:{question} 选项:{options_text}\n答案:从A到{max_option_letter},我们应选择", - "mcq_zh_nocot_few_shot": "问题. {passage} {question}\n从以下选项中选择:{options_text}\n答案是", - "mcq_zh_cot_zero_shot": "{passage}问题:{question} 选项:{options_text}\n答案:从A到{max_option_letter},我们应选择什么?让我们逐步思考:", - "mcq_zh_cot_few_shot": "问题. {passage} {question}\n从以下选项中选择:{options_text}\n问题的解析:", + "mcq_zh_nocot_zero_shot": + "{{ passage if passage }}问题:{{ question }} 选项:{{ options_text }}\n答案:从A到{{ max_option_letter }},我们应选择", + "mcq_zh_nocot_few_shot": "问题. {{ passage if passage }} {{ question }}\n从以下选项中选择:{{ options_text }}\n答案是", + "mcq_zh_cot_zero_shot": + "{{ passage if passage }}问题:{{ question }} 选项:{{ options_text }}\n答案:从A到{{ max_option_letter }},我们应选择什么?让我们逐步思考:", + "mcq_zh_cot_few_shot": "问题. {{ passage if passage }} {{ question }}\n从以下选项中选择:{{ options_text }}\n问题的解析:", "mcq_en_nocot_zero_shot": - "{passage}Q: {question} Answer Choices: {options_text}\nA: Among A through {max_option_letter}, the answer is", + "{{ passage if passage }}Q: {{ question }} Answer Choices: {{ options_text }}\nA: Among A through {{ max_option_letter }}, the answer is", "mcq_en_nocot_few_shot": - "Question. {passage} {question}\Choose from the following options: {options_text}\nThe answer is therefore", - "mcq_en_cot_zero_shot": "{passage}Q: {question} Answer Choices: {options_text}\nLet's think step by step.", + "Question. 
{{ passage if passage }} {{ question }}\Choose from the following options: {{ options_text }}\nThe answer is therefore", + "mcq_en_cot_zero_shot": + "{{ passage if passage }}Q: {{ question }} Answer Choices: {{ options_text }}\nLet's think step by step.", "mcq_en_cot_few_shot": - "Question. {passage} {question}\nChoose from the following options: {options_text}\nExplanation for Problem:", - "gen_zh_nocot_zero_shot": "{passage}问题:{question}\n答案:", - "gen_zh_nocot_few_shot": "问题. {passage} {question}\n答案是", - "gen_zh_cot_zero_shot": "{passage}问题:{question}\n答案:让我们逐步思考", - "gen_zh_cot_few_shot": "问题. {passage} {question}\n问题的解析:", - "gen_en_nocot_zero_shot": "{passage}Q: {question}\nA: The answer is", - "gen_en_nocot_few_shot": "Question. {passage} {question}\nThe answer is therefore", - "gen_en_cot_zero_shot": "{passage}Q: {question}\nA: Let's think step by step", - "gen_en_cot_few_shot": "Question. {passage} {question}\nExplanation for Problem:", + "Question. {{ passage if passage }} {{ question }}\nChoose from the following options: {{ options_text }}\nExplanation for Problem:", + "gen_zh_nocot_zero_shot": "{{ passage if passage }}问题:{{ question }}\n答案:", + "gen_zh_nocot_few_shot": "问题. {{ passage if passage }} {{ question }}\n答案是", + "gen_zh_cot_zero_shot": "{{ passage if passage }}问题:{{ question }}\n答案:让我们逐步思考", + "gen_zh_cot_few_shot": "问题. {{ passage if passage }} {{ question }}\n问题的解析:", + "gen_en_nocot_zero_shot": "{{ passage if passage }}Q: {{ question }}\nA: The answer is", + "gen_en_nocot_few_shot": "Question. {{ passage if passage }} {{ question }}\nThe answer is therefore", + "gen_en_cot_zero_shot": "{{ passage if passage }}Q: {{ question }}\nA: Let's think step by step", + "gen_en_cot_few_shot": "Question. {{ passage if passage }} {{ question }}\nExplanation for Problem:", } TARGETS = { diff --git a/utilization/dataset/agieval_single_choice.py b/utilization/dataset/agieval_single_choice.py index 8b44dafd..20bffd23 100644 --- a/utilization/dataset/agieval_single_choice.py +++ b/utilization/dataset/agieval_single_choice.py @@ -2,7 +2,7 @@ from functools import cached_property from logging import getLogger -from .dataset_enum import AGIEVAL_SUBJECTS, AGIEVAL_ZH_PROMPT_TASKS +from ..dataset_enum import AGIEVAL_SUBJECTS, AGIEVAL_ZH_PROMPT_TASKS from .multiple_choice_dataset import MultipleChoiceDataset logger = getLogger(__name__) @@ -10,10 +10,12 @@ uncleaned_label = re.compile(r"^(\([ABCDEFGHIJ]\)|[ABCDEFGHIJ] *\.) *") INSTRUCTIONS = { - "zh_zero_shot": "{passage}问题:{question}\n{options}答案:从A到{max_option_letter},我们应选择", - "zh_few_shot": "问题. {passage} {question}\n{options}从以下选项中选择:", - "en_zero_shot": "{passage}Q: {question}\n{options}Answer: Among A through {max_option_letter}, the answer is", - "en_few_shot": "Question. {passage} {question}\n{options}Choose from the following options: ", + "zh_zero_shot": "{{ passage if passage }}问题:{{ question }}\n{{ options }}\n答案:从A到{{ max_option_letter }},我们应选择", + "zh_few_shot": "问题. {{ passage if passage }} {{ question }}\n{{ options }}\n从以下选项中选择:", + "en_zero_shot": + "{{ passage if passage}}Q: {{question}}\n{{options}}\nAnswer: Among A through {{ max_option_letter }}, the answer is", + "en_few_shot": + "Question. 
{{ passage if passage }} {{ question }}\n{{ options }}\nChoose from the following options: ", } diff --git a/utilization/dataset/bbh.py b/utilization/dataset/bbh.py index df42bdd1..b15ab9c5 100644 --- a/utilization/dataset/bbh.py +++ b/utilization/dataset/bbh.py @@ -3,8 +3,8 @@ from logging import getLogger from typing import List +from ..dataset_enum import BBH_LETTER_CHOICE, BBH_NO_CHOICE from ..metric import Em -from .dataset_enum import BBH_LETTER_CHOICE, BBH_NO_CHOICE from .generation_dataset import GenerationDataset logger = getLogger(__name__) diff --git a/utilization/dataset/ceval.py b/utilization/dataset/ceval.py index 5e555d97..b7207f99 100644 --- a/utilization/dataset/ceval.py +++ b/utilization/dataset/ceval.py @@ -1,7 +1,7 @@ from functools import cached_property from logging import getLogger -from .dataset_enum import CEVAL_SUBJECTS, CEVAL_TRANS +from ..dataset_enum import CEVAL_SUBJECTS, CEVAL_TRANS from .multiple_choice_dataset import MultipleChoiceDataset logger = getLogger(__name__) diff --git a/utilization/dataset/dataset.py b/utilization/dataset/dataset.py index 162babbc..9b34b870 100644 --- a/utilization/dataset/dataset.py +++ b/utilization/dataset/dataset.py @@ -1,7 +1,7 @@ import typing from collections import OrderedDict, defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed -from copy import copy +from copy import deepcopy from functools import cached_property from itertools import chain, islice from logging import getLogger @@ -13,16 +13,14 @@ import pandas as pd import torch -from utilization.utils import dynamic_stride_tqdm - -from ..metric.utils import avg_metrics -from ..model.model_enum import ENDPOINT_ARGS -from ..utils.batch_sampler import DatasetCollectionBatchSampler -from ..utils.conversation import Conversation, ConversationFormatter +from ..dataset_enum import GAOKAO_CHINESE_TASKS_SCORE, GAOKAO_ENGLISH_TASKS_SCORE, GAOKAO_TASKS_SCORE +from ..metric.metric_utils import avg_metrics +from ..model.model_utils import Conversation, ConversationFormatter, DatasetCollectionBatchSampler +from ..model_enum import ENDPOINT_ARGS +from ..utils.dynamic_stride_tqdm import dynamic_stride_tqdm from ..utils.log_results import PredictionWriter, log_final_results, repeat_iter -from .dataset_enum import GAOKAO_CHINESE_TASKS_SCORE, GAOKAO_ENGLISH_TASKS_SCORE, GAOKAO_TASKS_SCORE -from .icl_strategies import ape, global_entropy_ordering_strategy, knn_construct_examples -from .utils import DatasetUtilMixin, get_raw_dataset_loader +from .dataset_utils import DatasetUtilMixin, get_raw_dataset_loader +from .dataset_utils.icl_strategies import ape, global_entropy_ordering_strategy, knn_construct_examples if typing.TYPE_CHECKING: # solve the circular import @@ -76,7 +74,7 @@ class Dataset(torch.utils.data.Dataset, DatasetUtilMixin): example_set: Optional[str] = None r"""The example split of dataset. Example data will be automatically loaded if this is not None.""" - load_args: Union[Tuple[str], Tuple[str, str], Tuple[()]] = () + load_args: Union[Tuple[str], Tuple[str, str], Tuple[()], None] = None r"""Arguments for loading the dataset with huggingface `load_dataset`. 
Supported formats: @@ -171,12 +169,12 @@ def __init__( self.evaluation_set = args.evaluation_set or self.evaluation_set self.example_set = args.example_set or self.example_set if self.max_num_shots: - if not self.example_set: + if not self.example_set and not example_data: # example_set is not mandatory when `load_raw_dataset` is overriden logger.warning( f"Please provide the example set for dataset {self.display_name} to construct few-shot examples. You can ignore this warning if `load_raw_dataset` is correctly implemented." ) - elif "val" in self.example_set or "test" in self.example_set: + elif self.example_set and ("val" in self.example_set or "test" in self.example_set): logger.warning( f"Example set is used for constructing few-shot examples, but `{self.example_set}` seems to be an evaluation set." ) @@ -344,7 +342,7 @@ def _init_arguments(self): self.init_arguments() - self._extra_model_args = copy(self.extra_model_args) + self._extra_model_args = deepcopy(self.extra_model_args) # apply chat template if self.conversation_formatter.default_stops: @@ -548,10 +546,11 @@ def _format_instance( raise ValueError(f"Key `{key}` is reserved for dataset extensions and cannot be used in the instance.") formatted_instance[key] = getattr(self, key) - if self.instruction_template.debug_info: - source = self.instruction_template.render(formatted_instance) - else: - source = self.instruction.format_map(formatted_instance) + if not isinstance(source, list): + if self.instruction_template.debug_info: + source = self.instruction_template.render(formatted_instance) + else: + source = self.instruction.format_map(formatted_instance) return {"source": source, "target": target, "options": options} diff --git a/utilization/dataset/dataset_utils/__init__.py b/utilization/dataset/dataset_utils/__init__.py new file mode 100644 index 00000000..b444eac7 --- /dev/null +++ b/utilization/dataset/dataset_utils/__init__.py @@ -0,0 +1,3 @@ +from .icl_strategies import ape, global_entropy_ordering_strategy, knn_construct_examples +from .raw_dataset_loader import get_raw_dataset_loader +from .util_mixin import DatasetUtilMixin diff --git a/utilization/dataset/icl_strategies.py b/utilization/dataset/dataset_utils/icl_strategies.py similarity index 99% rename from utilization/dataset/icl_strategies.py rename to utilization/dataset/dataset_utils/icl_strategies.py index f9e673fc..18ece559 100644 --- a/utilization/dataset/icl_strategies.py +++ b/utilization/dataset/dataset_utils/icl_strategies.py @@ -4,7 +4,7 @@ import torch from tqdm import tqdm -from ..model import openai_model +from ...model import openai_model def knn_construct_examples(instance_query, example_dataset, k): diff --git a/utilization/dataset/utils.py b/utilization/dataset/dataset_utils/raw_dataset_loader.py similarity index 63% rename from utilization/dataset/utils.py rename to utilization/dataset/dataset_utils/raw_dataset_loader.py index d4bfa29e..33725f05 100644 --- a/utilization/dataset/utils.py +++ b/utilization/dataset/dataset_utils/raw_dataset_loader.py @@ -1,19 +1,12 @@ import json import os import re -from bisect import bisect_left, bisect_right -from dataclasses import dataclass from importlib.machinery import SourceFileLoader from logging import getLogger from os.path import abspath, relpath -from pprint import pformat -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Callable, Dict, Optional, Tuple, Union import datasets as ds -import tiktoken -from transformers import PreTrainedTokenizer, 
PreTrainedTokenizerFast - -from utilization.utils.conversation import Conversation, ConversationFormatter logger = getLogger(__name__) @@ -21,81 +14,15 @@ slice_regex = re.compile(r"\[(\d*):(\d*)\]") -@dataclass -class DatasetUtilMixin: - - answer_prompt: str = "Answer:" - - def set_tokenizer( - self, tokenizer: Union[tiktoken.Encoding, PreTrainedTokenizer, PreTrainedTokenizerFast, None] - ) -> None: - self.tokenizer = tokenizer - if isinstance(tokenizer, tiktoken.Encoding): - # Encoding.encode_ordinary is slightly faster than Encoding.encode - self.tokenizer_encode = tokenizer.encode_ordinary - elif isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): - self.tokenizer_encode = tokenizer.encode - if tokenizer is not None: - self.tokenizer_decode = tokenizer.decode - - def _apply_normalization(self, conversations: List[Conversation]): - normalized_conversations = [Conversation.from_chat(assistant=conv[-1]["content"]) for conv in conversations] - conversations.extend(normalized_conversations) - - def prompt_token_nums(self, prompt: str): - return len(self.tokenizer_encode(prompt)) - - def truncate_by_word( - self, - words: List[str], - max_tokens: int, - side: Literal["left", "right"], - ) -> Tuple[str, int, int]: - """Truncate the prompt by word to fit the maximum token length. - - Return: - - prompt: the truncated prompt - - real_token_nums: the real token numbers of the truncated prompt - - word_nums: the number of words in the truncated prompt - """ - lengths = [0] - for w in words: - lengths.append(lengths[-1] + len(w)) - prompt = "".join(words) - - tokens = self.tokenizer_encode(prompt) - real_token_nums = len(tokens) - if real_token_nums <= max_tokens: - return prompt, real_token_nums, len(words) - - st = 0 - ed = len(words) - if side == "left": - truncated_raw = self.tokenizer_decode(tokens[-max_tokens:]) - st = bisect_left(lengths, len(prompt) - len(truncated_raw)) - elif side == "right": - truncated_raw = self.tokenizer_decode(tokens[:max_tokens]) - ed = bisect_right(lengths, len(truncated_raw)) - 1 - prompt = "".join(words[st:ed]) - real_token_nums = self.prompt_token_nums(prompt) - return prompt, real_token_nums, ed - st - - def _log_instance(self, log: Callable, instance: Conversation, idx: int): - formatter = getattr(self, "conversation_formatter", None) - if isinstance(formatter, ConversationFormatter): - istr = formatter.apply_prompt_template(instance, add_generation_prompt=True) - log(f"Formatted evaluation instance {idx}\n" + pformat(istr, width=100)) - else: - for i, seg in enumerate(instance): - log(f"Formatted evaluation instance {idx} ({seg['role']}_{i})\n" + pformat(seg["content"], width=100)) - - def accepts_subset( - load_args: Union[Tuple[str], Tuple[str, str], Tuple[()]], + load_args: Union[Tuple[str], Tuple[str, str], Tuple[()], None], overwrite_subset: bool = True, subset: str = "", disable_warning: bool = False, ) -> bool: + if load_args is None: + return False + if len(load_args) == 2 and isinstance(load_args[1], str): if overwrite_subset: if not disable_warning and load_args[1] != subset: @@ -147,6 +74,7 @@ def get_raw_dataset_loader( - local file pattern `"{dataset_path}".format(subset=subset_name, split=split)` """ + if subset_name: dataset_msg = f"{dataset_name}:{subset_name}" else: @@ -161,6 +89,9 @@ def get_raw_dataset_loader( else: load_kwargs["download_config"] = ds.DownloadConfig(use_etag=use_etag) + if "trust_remote_code" not in load_kwargs: + load_kwargs["trust_remote_code"] = True + # if `dataset_path` is not None, load from local 
path if dataset_path is not None: dataset_path = abspath(dataset_path) @@ -168,28 +99,38 @@ def get_raw_dataset_loader( if subset_name is None and len(load_args) > 1 and load_args[1] is not None: subset_name = load_args[1] + load_from_script = False + if os.path.isdir(dataset_path): + load_script = dataset_path + "/" + dataset_path.split("/")[-1].split("--")[-1] + ".py" + if os.path.exists(load_script): + load_from_script = True + + if load_from_script: + logger.debug(f"Loading from a downloaded dataset: {load_script}, {subset_name}") + + # hfd + def load_fn(split): + return ds.load_dataset(load_script, subset_name, split=split, **load_kwargs) + # load from a cloned repository from huggingface - if os.path.exists(os.path.join(dataset_path, "dataset_infos.json")): + elif os.path.exists(os.path.join(dataset_path, "dataset_infos.json")): infos = json.load(open(os.path.join(dataset_path, "dataset_infos.json"))) - # find the correct subset + # find the correct subset. e.g. copa if dataset_name in infos: logger.debug(f"Loading from a cloned or cached repository: {dataset_path}, {dataset_name}") def load_fn(split): - return ds.load_dataset( - dataset_path, dataset_name, split=split, trust_remote_code=True, **load_kwargs - ) + return ds.load_dataset(dataset_path, dataset_name, split=split, **load_kwargs) - elif subset_name in infos: + # e.g. hellaswag + elif subset_name in infos or (subset_name is None and "default" in infos): logger.debug(f"Loading from a cloned or cached repository: {dataset_path}, {subset_name}") def load_fn(split): - return ds.load_dataset( - dataset_path, subset_name, split=split, trust_remote_code=True, **load_kwargs - ) + return ds.load_dataset(dataset_path, subset_name, split=split, **load_kwargs) else: raise ValueError( @@ -201,7 +142,7 @@ def load_fn(split): logger.debug(f"Loading from a cloned or cached repository: {dataset_path}, {subset_name}") def load_fn(split): - return ds.load_dataset(dataset_path, "default", split=split, trust_remote_code=True, **load_kwargs) + return ds.load_dataset(dataset_path, "default", split=split, **load_kwargs) # load from a local directory elif os.path.exists(os.path.join(dataset_path, "dataset_dict.json")): @@ -226,51 +167,31 @@ def load_fn(split): elif os.path.isdir(dataset_path): offline = os.environ.get("HF_DATASETS_OFFLINE") == "1" - if os.path.exists(dataset_path + "/" + dataset_path.split("/")[-1] + ".py"): - offline = True - if ".cache" in dataset_path: + if offline: - _path = load_args[0] if len(load_args) >= 1 else dataset_name - logger.debug(f"Loading from a cache dir: {_path}, {subset_name}") + logger.debug(f"Loading from a downloaded dataset: {dataset_path}, default") + # example command: ceval (cloned from huggingface repo, in .csv format) def load_fn(split): return ds.load_dataset( - _path, subset_name, split=split, cache_dir=dataset_path, trust_remote_code=True, **load_kwargs + dataset_path, "default", split=split, data_dir=relpath(dataset_path), **load_kwargs ) - elif offline: - config_name = subset_name - if not os.path.exists(dataset_path + "/" + dataset_path.split("/")[-1] + ".py"): - config_name = "default" + elif ".cache" in dataset_path: - logger.debug(f"Loading from a downloaded dataset: {dataset_path}, {config_name}") + _path = load_args[0] if len(load_args) >= 1 else dataset_name + logger.debug(f"Loading from a cache dir: {_path}, {subset_name}") - # example command: ceval (cloned from huggingface repo, in .csv format) def load_fn(split): - return ds.load_dataset( - dataset_path, - config_name, - split=split, - 
data_dir=relpath(dataset_path), - download_config=download_config, - trust_remote_code=True, - ) - + return ds.load_dataset(_path, subset_name, split=split, cache_dir=dataset_path, **load_kwargs) else: logger.debug(f"Loading from a manually-downloaded dataset: {dataset_path}, {subset_name}") # example command: story_cloze def load_fn(split): - return ds.load_dataset( - dataset_name, - subset_name, - split=split, - data_dir=dataset_path, - download_config=download_config, - trust_remote_code=True, - ) + return ds.load_dataset(dataset_name, subset_name, split=split, data_dir=dataset_path, **load_kwargs) # load from a file else: @@ -317,18 +238,17 @@ def load_fn(split): msg += f" from huggingface ({', '.join(load_args)})" def load_fn(split): - return ds.load_dataset(*load_args, split=split, trust_remote_code=True, **load_kwargs) - - if load_fn is None: - raise ValueError( - f"Failed to load dataset `{dataset_msg}`. Please check if the dataset exists in huggingface or local path." - ) + return ds.load_dataset(*load_args, split=split, **load_kwargs) def informative_load_fn(split=None) -> ds.Dataset: try: return load_fn(split=split) except KeyError as e: raise ValueError(f"Cannot find split `{split}` in `{dataset_msg}`.") from e + except TypeError as e: + raise ValueError( + f"Failed to load dataset `{dataset_msg}`. Please check if the dataset exists in huggingface or local path." + ) from e if return_msg: return informative_load_fn, msg diff --git a/utilization/dataset/dataset_utils/util_mixin.py b/utilization/dataset/dataset_utils/util_mixin.py new file mode 100644 index 00000000..015bca8b --- /dev/null +++ b/utilization/dataset/dataset_utils/util_mixin.py @@ -0,0 +1,78 @@ +from bisect import bisect_left, bisect_right +from dataclasses import dataclass +from pprint import pformat +from typing import Callable, List, Literal, Tuple, Union + +import tiktoken +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + +from ...model.model_utils.conversation import Conversation, ConversationFormatter + + +@dataclass +class DatasetUtilMixin: + + answer_prompt: str = "Answer:" + + def set_tokenizer( + self, tokenizer: Union[tiktoken.Encoding, PreTrainedTokenizer, PreTrainedTokenizerFast, None] + ) -> None: + self.tokenizer = tokenizer + if isinstance(tokenizer, tiktoken.Encoding): + # Encoding.encode_ordinary is slightly faster than Encoding.encode + self.tokenizer_encode = tokenizer.encode_ordinary + elif isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): + self.tokenizer_encode = tokenizer.encode + if tokenizer is not None: + self.tokenizer_decode = tokenizer.decode + + def _apply_normalization(self, conversations: List[Conversation]): + normalized_conversations = [Conversation.from_chat(assistant=conv[-1]["content"]) for conv in conversations] + conversations.extend(normalized_conversations) + + def prompt_token_nums(self, prompt: str): + return len(self.tokenizer_encode(prompt)) + + def truncate_by_word( + self, + words: List[str], + max_tokens: int, + side: Literal["left", "right"], + ) -> Tuple[str, int, int]: + """Truncate the prompt by word to fit the maximum token length. 
+ + Return: + - prompt: the truncated prompt + - real_token_nums: the real token numbers of the truncated prompt + - word_nums: the number of words in the truncated prompt + """ + lengths = [0] + for w in words: + lengths.append(lengths[-1] + len(w)) + prompt = "".join(words) + + tokens = self.tokenizer_encode(prompt) + real_token_nums = len(tokens) + if real_token_nums <= max_tokens: + return prompt, real_token_nums, len(words) + + st = 0 + ed = len(words) + if side == "left": + truncated_raw = self.tokenizer_decode(tokens[-max_tokens:]) + st = bisect_left(lengths, len(prompt) - len(truncated_raw)) + elif side == "right": + truncated_raw = self.tokenizer_decode(tokens[:max_tokens]) + ed = bisect_right(lengths, len(truncated_raw)) - 1 + prompt = "".join(words[st:ed]) + real_token_nums = self.prompt_token_nums(prompt) + return prompt, real_token_nums, ed - st + + def _log_instance(self, log: Callable, instance: Conversation, idx: int): + formatter = getattr(self, "conversation_formatter", None) + if isinstance(formatter, ConversationFormatter): + istr = formatter.apply_prompt_template(instance, add_generation_prompt=True) + log(f"Formatted evaluation instance {idx}\n" + pformat(istr, width=100)) + else: + for i, seg in enumerate(instance): + log(f"Formatted evaluation instance {idx} ({seg['role']}_{i})\n" + pformat(seg["content"], width=100)) diff --git a/utilization/dataset/drop.py b/utilization/dataset/drop.py index 5a37b406..86902bfa 100644 --- a/utilization/dataset/drop.py +++ b/utilization/dataset/drop.py @@ -27,7 +27,7 @@ class Drop(GenerationDataset): instruction = "Answer the question based on the given passage.\n\nPassage: {passage}\nQuestion: {question}\nAnswer:" example_set = "train" evaluation_set = "validation" - load_args = ("EleutherAI/drop",) + load_args = ("ucinlp/drop",) load_kwargs = {"download_config": ds.DownloadConfig(extract_compressed_file=True)} metrics = [F1(force_number_match=True, word_tokenize="regex", align_bag="counter"), Em()] extra_model_args = dict(max_tokens=64, temperature=0, stop=["\n"]) diff --git a/utilization/dataset/gaokao.py b/utilization/dataset/gaokao.py index 8c7759dd..966d763f 100644 --- a/utilization/dataset/gaokao.py +++ b/utilization/dataset/gaokao.py @@ -2,8 +2,8 @@ from functools import cached_property from logging import getLogger +from ..dataset_enum import GAOKAO_TASKS from ..metric import Gaokao_bench_metric -from .dataset_enum import GAOKAO_TASKS from .generation_dataset import GenerationDataset logger = getLogger(__name__) diff --git a/utilization/dataset/generation_dataset.py b/utilization/dataset/generation_dataset.py index f25532cb..a3d47342 100644 --- a/utilization/dataset/generation_dataset.py +++ b/utilization/dataset/generation_dataset.py @@ -7,4 +7,3 @@ class GenerationDataset(Dataset): r"""The dataset for Generation problems. 
It solves problems in natural language and is evaluated using `accuracy` score.""" evaluation_type = "generation" - metric = "accuracy" diff --git a/utilization/dataset/mmlu.py b/utilization/dataset/mmlu.py index 1078a937..fdf265ab 100644 --- a/utilization/dataset/mmlu.py +++ b/utilization/dataset/mmlu.py @@ -1,7 +1,7 @@ from functools import cached_property from logging import getLogger -from .dataset_enum import MMLU_SUBJECTS +from ..dataset_enum import MMLU_SUBJECTS from .multiple_choice_dataset import MultipleChoiceDataset logger = getLogger(__name__) diff --git a/utilization/dataset/squad.py b/utilization/dataset/squad.py index 876aa60d..aa63ced4 100644 --- a/utilization/dataset/squad.py +++ b/utilization/dataset/squad.py @@ -10,7 +10,7 @@ class Squad(GenerationDataset): - """The dataset of Squad and Squad_v2. + """The dataset of Squad. Gcombines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. @@ -23,7 +23,7 @@ class Squad(GenerationDataset): instruction = 'Answer each question using information in the preceding background paragraph.\nIf there is not enough information provided, answer with "Not in background."\n\nTitle: {title}\nBackground: {context}\n\nQ: {question}\n\nA:' example_set = "train" evaluation_set = "validation" - load_args = () # in order to support squad_v2, load_args is set in load.py + load_args = ("squad",) metrics = [F1(word_tokenize="split"), Em()] extra_model_args = dict(max_tokens=64, temperature=0, stop=["\n"]) diff --git a/utilization/dataset/squad_v2.py b/utilization/dataset/squad_v2.py new file mode 100644 index 00000000..0d8a48eb --- /dev/null +++ b/utilization/dataset/squad_v2.py @@ -0,0 +1,15 @@ +from .squad import Squad as SquadDataset + + +class SquadV2(SquadDataset): + """The dataset of SquadV2. + + Gcombines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. + + Examples: + context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. + question: In what country is Normandy located? + answer: ['France', 'France', 'France', 'France'] + """ + + load_args = ("squad_v2",) diff --git a/utilization/dataset/translation_dataset.py b/utilization/dataset/translation_dataset.py index c96c7854..234216de 100644 --- a/utilization/dataset/translation_dataset.py +++ b/utilization/dataset/translation_dataset.py @@ -14,7 +14,7 @@ class TranslationDataset(GenerationDataset): reference: Obama welcomes Netanyahu """ - instruction = "Q: Translate to {{lang}}. {{translation[self.subset_name[:2]]}}\nA:" + instruction = "Q: Translate to {{lang}}. 
{{translation[subset_name[:2]]}}\nA:" evaluation_set = "test" example_set = "train" metrics = [Bleu()] @@ -27,7 +27,7 @@ def init_arguments(self): def format_instance(self, instance): instance["lang"] = self.language - instance["target"] = instance[self.subset_name[3:5]] + instance["target"] = instance["translation"][self.subset_name[3:5]] return instance @staticmethod diff --git a/utilization/dataset/dataset_enum.py b/utilization/dataset_enum.py similarity index 99% rename from utilization/dataset/dataset_enum.py rename to utilization/dataset_enum.py index a68b97db..49ee0e87 100644 --- a/utilization/dataset/dataset_enum.py +++ b/utilization/dataset_enum.py @@ -8,7 +8,6 @@ DATASET_ALIASES = { "agieval": ["agieval_single_choice", "agieval_cot"], # try to use MultipleChoiceDataset first - "squad_v2": ["squad"], } for wmt in WMT_DATASETS: diff --git a/utilization/evaluator.py b/utilization/evaluator.py index a8d48d92..bc1ec407 100644 --- a/utilization/evaluator.py +++ b/utilization/evaluator.py @@ -1,13 +1,12 @@ from logging import getLogger from statistics import mode -from typing import Callable, Dict, Optional +from typing import Any, Callable, Dict, List, Optional -from torch.utils.data import DataLoader - -from .dataset import load_datasets -from .model import load_model -from .utils import DatasetArguments, EvaluationArguments, ModelArguments, catch_error, dynamic_stride_tqdm -from .utils.arguments import check_args +from .load_dataset import load_datasets +from .load_model import load_model +from .utils.arguments import DatasetArguments, EvaluationArguments, ModelArguments, check_args +from .utils.catch_error import catch_error +from .utils.dynamic_stride_tqdm import dynamic_stride_tqdm from .utils.log_results import PredictionWriter from .utils.logging import set_logging from .utils.random import set_seed @@ -35,6 +34,8 @@ def __init__( evaluation_args: Optional[EvaluationArguments] = None, initalize: bool = True, load_hf_model: Optional[Callable] = None, + evaluation_data: Optional[List[Dict[str, Any]]] = None, + example_data: Optional[List[Dict[str, Any]]] = None, ): self.model_args = model_args @@ -53,10 +54,16 @@ def __init__( self.model = load_model(self.model_args) self.writer = PredictionWriter(self.dataset_args.evaluation_results_path) - self.dataset = load_datasets(self.dataset_args, self.model) + self.dataset = load_datasets( + self.dataset_args, + self.model, + self.evaluation_args, + evaluation_data=evaluation_data, + example_data=example_data, + ) self.writer.write_metainfo(self.model_args, self.dataset_args, self.evaluation_args) - @catch_error(continue_from=True) + @catch_error(True) def evaluate(self) -> Dict[str, Dict[str, float]]: r"""It conducts the evaluation on the dataset with corresponding models. We support two evaluation types: @@ -66,6 +73,8 @@ def evaluate(self) -> Dict[str, Dict[str, float]]: Finally, we call the `calculate_metric` to get the metric score of prediction results. """ + from torch.utils.data import DataLoader + if self.evaluation_args.dry_run: self.model.get_ppl = lambda x: [(0, 1)] * len(x) self.model.generation = lambda x: [""] * len(x) diff --git a/utilization/hfd.sh b/utilization/hfd.sh new file mode 100644 index 00000000..afa14f4e --- /dev/null +++ b/utilization/hfd.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash +# Source: https://gist.github.com/padeoe/697678ab8e528b85a2a7bddafea1fa4f +# Color definitions +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +trap 'printf "${YELLOW}\nDownload interrupted. 
If you re-run the command, you can resume the download from the breakpoint.\n${NC}"; exit 1' INT + +display_help() { + cat << EOF +Usage: + hfd [--include include_pattern] [--exclude exclude_pattern] [--hf_username username] [--hf_token token] [--tool aria2c|wget] [-x threads] [--dataset] [--local-dir path] + +Description: + Downloads a model or dataset from Hugging Face using the provided repo ID. + +Parameters: + repo_id The Hugging Face repo ID in the format 'org/repo_name'. + --include (Optional) Flag to specify a string pattern to include files for downloading. + --exclude (Optional) Flag to specify a string pattern to exclude files from downloading. + include/exclude_pattern The pattern to match against filenames, supports wildcard characters. e.g., '--exclude *.safetensor', '--include vae/*'. + --hf_username (Optional) Hugging Face username for authentication. **NOT EMAIL**. + --hf_token (Optional) Hugging Face token for authentication. + --tool (Optional) Download tool to use. Can be wget (default) or aria2c. + -x (Optional) Number of download threads for aria2c. Defaults to 4. + --dataset (Optional) Flag to indicate downloading a dataset. + --local-dir (Optional) Local directory path where the model or dataset will be stored. + --mirror (Optional) Force use hf-mirror + +Example: + hfd bigscience/bloom-560m --exclude *.safetensors + hfd meta-llama/Llama-2-7b --hf_username myuser --hf_token mytoken -x 4 + hfd lavita/medical-qa-shared-task-v1-toy --dataset +EOF + exit 1 +} + +MODEL_ID=$1 +shift + +# Default values +TOOL="wget" +THREADS=4 +HF_ENDPOINT=${HF_ENDPOINT:-"https://huggingface.co"} + +while [[ $# -gt 0 ]]; do + case $1 in + --include) INCLUDE_PATTERN="$2"; shift 2 ;; + --exclude) EXCLUDE_PATTERN="$2"; shift 2 ;; + --hf_username) HF_USERNAME="$2"; shift 2 ;; + --hf_token) HF_TOKEN="$2"; shift 2 ;; + --tool) TOOL="$2"; shift 2 ;; + -x) THREADS="$2"; shift 2 ;; + --dataset) DATASET=1; shift ;; + --local-dir) LOCAL_DIR="$2"; shift 2 ;; + --mirror) HF_ENDPOINT="https://hf-mirror.com"; shift ;; + *) shift ;; + esac +done + + +# Check if aria2, wget, curl, git, and git-lfs are installed +check_command() { + if ! command -v $1 &>/dev/null; then + echo -e "${RED}$1 is not installed. 
Please install it first.${NC}" + exit 1 + fi +} + +# Mark current repo safe when using shared file system like samba or nfs +ensure_ownership() { + if git status 2>&1 | grep "fatal: detected dubious ownership in repository at" > /dev/null; then + git config --global --add safe.directory "${PWD}" + printf "${YELLOW}Detected dubious ownership in repository, mark ${PWD} safe using git, edit ~/.gitconfig if you want to reverse this.\n${NC}" + fi +} + +[[ "$TOOL" == "aria2c" ]] && check_command aria2c +[[ "$TOOL" == "wget" ]] && check_command wget +check_command curl; check_command git; check_command git-lfs + +[[ -z "$MODEL_ID" || "$MODEL_ID" =~ ^-h ]] && display_help + +if [[ -z "$LOCAL_DIR" ]]; then + LOCAL_DIR="${MODEL_ID#*/}" +fi + +if [[ "$DATASET" == 1 ]]; then + MODEL_ID="datasets/$MODEL_ID" +fi +echo "Downloading to $LOCAL_DIR" + +if [ -d "$LOCAL_DIR/.git" ]; then + printf "${YELLOW}%s exists, Skip Clone.\n${NC}" "$LOCAL_DIR" + cd "$LOCAL_DIR" && ensure_ownership && GIT_LFS_SKIP_SMUDGE=1 git pull || { printf "${RED}Git pull failed.${NC}\n"; exit 1; } +else + REPO_URL="$HF_ENDPOINT/$MODEL_ID" + GIT_REFS_URL="${REPO_URL}/info/refs?service=git-upload-pack" + echo "Testing GIT_REFS_URL: $GIT_REFS_URL" + response=$(curl -s -o /dev/null -w "%{http_code}" "$GIT_REFS_URL") + if [ "$response" == "401" ] || [ "$response" == "403" ]; then + if [[ -z "$HF_USERNAME" || -z "$HF_TOKEN" ]]; then + printf "${RED}HTTP Status Code: $response.\nThe repository requires authentication, but --hf_username and --hf_token is not passed. Please get token from https://huggingface.co/settings/tokens.\nExiting.\n${NC}" + exit 1 + fi + REPO_URL="https://$HF_USERNAME:$HF_TOKEN@${HF_ENDPOINT#https://}/$MODEL_ID" + elif [ "$response" != "200" ]; then + printf "${RED}Unexpected HTTP Status Code: $response\n${NC}" + printf "${YELLOW}Executing debug command: curl -v %s\nOutput:${NC}\n" "$GIT_REFS_URL" + curl -v "$GIT_REFS_URL"; printf "\n${RED}Git clone failed.\n${NC}"; exit 1 + fi + echo "GIT_LFS_SKIP_SMUDGE=1 git clone $REPO_URL $LOCAL_DIR" + + GIT_LFS_SKIP_SMUDGE=1 git clone $REPO_URL $LOCAL_DIR && cd "$LOCAL_DIR" || { printf "${RED}Git clone failed.\n${NC}"; exit 1; } + + ensure_ownership + + while IFS= read -r file; do + truncate -s 0 "$file" + done <<< $(git lfs ls-files | cut -d ' ' -f 3-) +fi + +printf "\nStart Downloading lfs files, bash script:\ncd $LOCAL_DIR\n" +files=$(git lfs ls-files | cut -d ' ' -f 3-) +declare -a urls + +while IFS= read -r file; do + if [ ! $file ]; then + continue + fi + url="$HF_ENDPOINT/$MODEL_ID/resolve/main/$file" + file_dir=$(dirname "$file") + mkdir -p "$file_dir" + if [[ "$TOOL" == "wget" ]]; then + download_cmd="wget -c \"$url\" -O \"$file\"" + [[ -n "$HF_TOKEN" ]] && download_cmd="wget --header=\"Authorization: Bearer ${HF_TOKEN}\" -c \"$url\" -O \"$file\"" + else + download_cmd="aria2c --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\"" + [[ -n "$HF_TOKEN" ]] && download_cmd="aria2c --header=\"Authorization: Bearer ${HF_TOKEN}\" --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\"" + fi + [[ -n "$INCLUDE_PATTERN" && ! 
"$file" == $INCLUDE_PATTERN ]] && printf "# %s\n" "$download_cmd" && continue + [[ -n "$EXCLUDE_PATTERN" && "$file" == $EXCLUDE_PATTERN ]] && printf "# %s\n" "$download_cmd" && continue + printf "%s\n" "$download_cmd" + urls+=("$url|$file") +done <<< "$files" + +for url_file in "${urls[@]}"; do + IFS='|' read -r url file <<< "$url_file" + if [[ -n "$file" ]]; then + continue + fi + printf "${YELLOW}Start downloading ${file}.\n${NC}" + file_dir=$(dirname "$file") + if [[ "$TOOL" == "wget" ]]; then + [[ -n "$HF_TOKEN" ]] && wget --header="Authorization: Bearer ${HF_TOKEN}" -c "$url" -O "$file" || wget -c "$url" -O "$file" + else + [[ -n "$HF_TOKEN" ]] && aria2c --header="Authorization: Bearer ${HF_TOKEN}" --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c "$url" -d "$file_dir" -o "$(basename "$file")" || aria2c --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c "$url" -d "$file_dir" -o "$(basename "$file")" + fi + [[ $? -eq 0 ]] && printf "Downloaded %s successfully.\n" "$url" || { printf "${RED}Failed to download %s.\n${NC}" "$url"; exit 1; } +done + +printf "${GREEN}Download completed successfully.\n${NC}" \ No newline at end of file diff --git a/utilization/dataset/load.py b/utilization/load_dataset.py similarity index 66% rename from utilization/dataset/load.py rename to utilization/load_dataset.py index c1f7a5d0..824975be 100644 --- a/utilization/dataset/load.py +++ b/utilization/load_dataset.py @@ -4,52 +4,60 @@ import logging import os from concurrent.futures import ThreadPoolExecutor, as_completed +from itertools import zip_longest from logging import getLogger -from typing import TYPE_CHECKING, Dict, Iterator, List, Set, Type +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Type from datasets import DownloadConfig, get_dataset_config_names -from utilization.metric.pass_at_k import PassAtK - -from ..metric import GPTEval -from ..utils.catch_error import catch_error -from ..utils.logging import list_datasets from .dataset import Dataset, DatasetCollection +from .dataset.dataset_utils.raw_dataset_loader import accepts_subset from .dataset_enum import DATASET_ALIASES -from .utils import accepts_subset +from .metric import GPTEval, PassAtK +from .utils.catch_error import catch_error +from .utils.hfd import get_script_path, huggingface_download +from .utils.logging import list_datasets if TYPE_CHECKING: # solve the circular import - from ..model.model import Model - from ..utils import DatasetArguments + from .model import Model + from .utils import DatasetArguments, EvaluationArguments logger = getLogger(__name__) -ABSTRACT_DATASET = {"Dataset", "GenerationDataset", "MultipleChoiceDataset"} +__all__ = ["load_datasets", "register_dataset"] + +ABSTRACT_DATASET = {"Dataset", "GenerationDataset", "MultipleChoiceDataset", "SquadDataset"} +REGISTERY = {} + + +def _validate_dataset_class(cls): + name = cls.__name__ + return issubclass(cls, Dataset) and name not in ABSTRACT_DATASET + + +def _fuzzy_match_prompt(dataset_name) -> str: + all_datasets = list_datasets() + matches = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6) + if len(matches) == 0: + fuzzy_match = f" Available choices are: {all_datasets}." + else: + fuzzy_match = f" Possible choices are: {matches}." + return fuzzy_match def _import_dataset_class(dataset_name: str) -> Type[Dataset]: - module_path = __package__ + "." + dataset_name + module_path = __package__ + ".dataset." 
+ dataset_name try: module = importlib.import_module(module_path) except ModuleNotFoundError as e: - all_datasets = list_datasets() - - if f"utilization.dataset.{dataset_name}" in str(e): - matches = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6) - if len(matches) == 0: - fuzzy_match = f" Available choices are: {all_datasets}." - else: - fuzzy_match = f" Possible choices are: {matches}." - else: - fuzzy_match = "" - + fuzzy_match = _fuzzy_match_prompt(dataset_name) if f"utilization.dataset.{dataset_name}" in str(e) else "" raise ValueError(f"Invalid dataset: {dataset_name}.{fuzzy_match}\n{e}") from e clsmembers = inspect.getmembers(module, inspect.isclass) for name, obj in clsmembers: - if issubclass(obj, Dataset) and name not in ABSTRACT_DATASET: + if _validate_dataset_class(obj): logger.debug(f"Dataset class `{name}` imported from `{module_path}`.") return obj @@ -60,7 +68,16 @@ def _import_dataset_class(dataset_name: str) -> Type[Dataset]: def import_dataset_classes(dataset_name: str) -> List[Type[Dataset]]: - if dataset_name in DATASET_ALIASES: + """Import dataset classes from the dataset_name. Look up order: + + 1. Registered datasets with `register_dataset` + 2. Dataset aliases defined in `DATASET_ALIASES` + 3. Native dataset classes in `utilization.dataset.{dataset_name}` + """ + + if dataset_name in REGISTERY: + return [REGISTERY[dataset_name]] + elif dataset_name in DATASET_ALIASES: logger.info("Loading dataset aliases: %s -> %s", dataset_name, DATASET_ALIASES[dataset_name]) return [_import_dataset_class(alias) for alias in DATASET_ALIASES[dataset_name]] else: @@ -71,31 +88,43 @@ def get_subsets( dataset_name: str, dataset_classes: List[Type[Dataset]], args: "DatasetArguments", - offline: bool = False + cache_paths: List[Optional[str]], ) -> List[Set[str]]: available_subsets = set() - available_subsets_by_cls = [] + available_subsets_by_cls: List[Set[str]] = [] - for dataset_cls in dataset_classes: + for dataset_cls, cache_path in zip_longest(dataset_classes, cache_paths): - # dynamically set load_args for squad and wmt datasets, in order to support squad_v2 and wmt series datasets + if dataset_cls.load_args is None: + available_subsets_by_cls.append(set()) + continue + + # dynamically set load_args for wmt datasets, in order to support wmt series datasets if not dataset_cls.load_args: dataset_cls.load_args = (dataset_name,) download_config = DownloadConfig(use_etag=False) + paths = [cache_path, args.dataset_path, dataset_cls.load_args[0]] if args.dataset_path is not None: - paths = [args.dataset_path, dataset_cls.load_args[0]] - else: - paths = [dataset_cls.load_args[0]] + paths = [str(get_script_path(cache_path))] + paths + if cache_path is not None: + paths = [str(get_script_path(cache_path))] + paths + found_config = False for path in paths: + if path is None: + continue + try: - s = get_dataset_config_names(path, download_config=download_config, trust_remote_code=True) + s = get_dataset_config_names(path=path, download_config=download_config, trust_remote_code=True) found_config = True break except Exception as e: - logger.info(f"Failed when trying to get_dataset_config_names: {e}") + logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}") + + logger.debug(f"get_dataset_config_names({path}): {s}") + if not found_config: os.environ["HF_DATASETS_OFFLINE"] = "1" s = [] @@ -122,6 +151,7 @@ def get_subsets( banned_subsets = {dataset_cls.banned_subsets} # type: ignore else: banned_subsets = set(dataset_cls.banned_subsets) + 
logger.debug(f"Removing banned subsets {banned_subsets} of {dataset_cls} from available subsets.") s -= banned_subsets available_subsets.update(s) @@ -159,13 +189,15 @@ def load_dataset( dataset_name: str, args: "DatasetArguments", model: "Model", - threading: bool = True, + evaluation_args: "EvaluationArguments", + evaluation_data: Optional[List[Dict[str, Any]]] = None, + example_data: Optional[List[Dict[str, Any]]] = None, ) -> Iterator[Dict[str, Dataset]]: """Load corresponding dataset class. One dataset class contains one subset, e.g., Mmlu(abstract_algebra), Mmlu() 1. Load dataset classes from dataset_name, e.g. `agieval` -> `Agieval_cot` - and `Agieval_single_choice`, `squad_v2` -> `Squad` + and `Agieval_single_choice` 2. Get available subsets for each dataset class, e.g., `Agieval_cot` -> `['lsat-ar', ...]`, `Agieval_single_choice` -> `[logiqa-zh', ...]` 3. Get subset names from command line arguments and get the intersection. @@ -182,9 +214,23 @@ def load_dataset( """ dataset_classes = import_dataset_classes(dataset_name) - available_subsets_by_cls = get_subsets(dataset_name, dataset_classes, args, offline=args.dataset_path is not None) + cache_paths = [] + for dcls in dataset_classes: + if dcls.load_args is None: + continue + elif len(dcls.load_args) > 0: + cache_paths.append(huggingface_download(dcls.load_args[0], args.hfd_cache_path, mirror=args.hf_mirror)) + else: + # dynamically set load_args for wmt datasets, in order to support wmt series datasets + cache_paths.append(huggingface_download(dataset_name, args.hfd_cache_path, mirror=args.hf_mirror)) + available_subsets_by_cls = get_subsets(dataset_name, dataset_classes, args, cache_paths) - for dataset_cls, available_subsets in zip(dataset_classes, available_subsets_by_cls): + for dataset_cls, available_subsets, cache_path in zip_longest( + dataset_classes, available_subsets_by_cls, cache_paths + ): + + if not args.passed_in_commandline("dataset_path"): + args.dataset_path = cache_path cmd_subset_names = get_cmd_subset_names(args.subset_names, dataset_cls) if len(args.subset_names) > 0 and len(cmd_subset_names) == 0: @@ -215,12 +261,13 @@ def load_dataset( subset_repr = ",".join(subset_names) logger.info("Loading dataset `%s` with subset(s): %s", dataset_name, subset_repr) - if threading and len(subset_names) > 2: + if evaluation_args.dataset_threading and len(subset_names) > 2: # load the first dataset in the main thread (only show the INFO log message for the first dataset) first_dataset = subset_names.pop(0) first_dataset = ( - dataset_name + ":" + first_dataset, dataset_cls(dataset_name, args, model, first_dataset) + dataset_name + ":" + first_dataset, + dataset_cls(dataset_name, args, model, first_dataset, evaluation_data, example_data) ) logger.info(f"Loading remaining subsets ...") logging.disable(logging.INFO) @@ -229,14 +276,20 @@ def load_dataset( with ThreadPoolExecutor(max_workers=len(subset_names)) as executor: res = [ executor.submit( - lambda s: (dataset_name + ":" + s, dataset_cls(dataset_name, args, model, s)), s + lambda s: ( + dataset_name + ":" + s, + dataset_cls(dataset_name, args, model, s, evaluation_data, example_data) + ), s ) for s in subset_names ] datasets = dict([first_dataset] + [f.result() for f in as_completed(res)]) logging.disable(logging.NOTSET) else: # load all datasets one by one - datasets = {dataset_name + ":" + s: dataset_cls(dataset_name, args, model, s) for s in subset_names} + datasets = { + dataset_name + ":" + s: dataset_cls(dataset_name, args, model, s, evaluation_data, 
example_data) + for s in subset_names + } yield datasets elif len(subset_names) == 1 and len(available_subsets) != 1 and accepts_subset( @@ -247,16 +300,25 @@ def load_dataset( # race:middle (one of the subsets), coqa (default) subset_name = next(iter(subset_names)) logger.info(f"Loading subset of dataset `{dataset_name}:{subset_name}`") - yield {dataset_name + ":" + subset_name: dataset_cls(dataset_name, args, model, subset_name)} + yield { + dataset_name + ":" + subset_name: + dataset_cls(dataset_name, args, model, subset_name, evaluation_data, example_data) + } else: # copa (super_glue:copa) or anli logger.info(f"Loading dataset `{dataset_name}`") - yield {dataset_name: dataset_cls(dataset_name, args, model)} + yield {dataset_name: dataset_cls(dataset_name, args, model, None, evaluation_data, example_data)} @catch_error() -def load_datasets(args: "DatasetArguments", model: "Model") -> DatasetCollection: +def load_datasets( + args: "DatasetArguments", + model: "Model", + evaluation_args: "EvaluationArguments", + evaluation_data: Optional[List[Dict[str, Any]]] = None, + example_data: Optional[List[Dict[str, Any]]] = None, +) -> DatasetCollection: if model.model_backend == "vllm": args.batch_size = -1 @@ -271,7 +333,16 @@ def load_datasets(args: "DatasetArguments", model: "Model") -> DatasetCollection # get all the dataset classes datasets = [] for d in args.dataset_names: - datasets.extend(load_dataset(d, args, model, args.dataset_threading)) + datasets.extend( + load_dataset( + d, + args, + model, + evaluation_args, + evaluation_data=evaluation_data, + example_data=example_data, + ) + ) datasets = {k: v for d in datasets for k, v in d.items()} logger.debug(datasets) if len(datasets) <= 0: diff --git a/utilization/model/load.py b/utilization/load_model.py similarity index 82% rename from utilization/model/load.py rename to utilization/load_model.py index 2c695f44..bcb53279 100644 --- a/utilization/model/load.py +++ b/utilization/load_model.py @@ -1,12 +1,12 @@ from logging import getLogger from typing import TYPE_CHECKING -from ..utils.catch_error import catch_error +from .utils.catch_error import catch_error if TYPE_CHECKING: # solve the circular import - from ..utils import ModelArguments from .model import Model + from .utils import ModelArguments logger = getLogger(__name__) @@ -23,22 +23,22 @@ def load_model(args: "ModelArguments") -> "Model": """ if args.is_openai_model(): logger.info(f"Loading OpenAI API model `{args.model_name_or_path}`.") - from .openai_model import Openai + from .model.openai_model import Openai return Openai(args) elif args.is_anthropic_model(): logger.info(f"Loading Anthropic API model `{args.model_name_or_path}`.") - from .anthropic_model import Anthropic + from .model.anthropic_model import Anthropic return Anthropic(args) elif args.is_dashscope_model(): logger.info(f"Loading Dashscope (Aliyun) API model `{args.model_name_or_path}`.") - from .dashscope_model import Dashscope + from .model.dashscope_model import Dashscope return Dashscope(args) elif args.is_qianfan_model(): logger.info(f"Loading Qianfan (Baidu) API model `{args.model_name_or_path}`.") - from .qianfan_model import Qianfan + from .model.qianfan_model import Qianfan return Qianfan(args) else: @@ -47,7 +47,7 @@ def load_model(args: "ModelArguments") -> "Model": import vllm vllm.__version__ - from .vllm_model import vllmModel + from .model.vllm_model import vllmModel return vllmModel(args) except ModuleNotFoundError: @@ -63,6 +63,6 @@ def load_model(args: "ModelArguments") -> "Model": raise 
ValueError(f"Set an appropriate tensor parallel size via CUDA_VISIBLE_DEVICES: {e}") else: raise e - from .huggingface_model import HuggingFaceModel + from .model.huggingface_model import HuggingFaceModel return HuggingFaceModel(args) diff --git a/utilization/metric/__init__.py b/utilization/metric/__init__.py index 50e468e1..45ac4c0a 100644 --- a/utilization/metric/__init__.py +++ b/utilization/metric/__init__.py @@ -8,7 +8,7 @@ "Perspective_api", "Rouge", "Word_Accuracy" ] -from .utils import avg_metrics +from .metric_utils import avg_metrics if TYPE_CHECKING: from .accuracy import Accuracy as _Accuracy diff --git a/utilization/metric/gpteval.py b/utilization/metric/gpteval.py index b759cc9a..f4c0c413 100644 --- a/utilization/metric/gpteval.py +++ b/utilization/metric/gpteval.py @@ -7,8 +7,6 @@ import openai from tqdm import tqdm -from ..model import load_model -from ..utils import ModelArguments from .metric import Metric logger = getLogger(__name__) @@ -35,6 +33,8 @@ class GPTEval(Metric): """ def __init__(self, multi_turn=False, type: Literal["single", "pairwise"] = "single"): + from ..utils import ModelArguments + self.multi_turn = multi_turn self.type = type self.model_args = ModelArguments( @@ -49,6 +49,8 @@ def __init__(self, multi_turn=False, type: Literal["single", "pairwise"] = "sing def __call__(self, predictions, references): # load gpteval model after the predictions of dataset are generated + from ..model import load_model + self.model = load_model(self.model_args) self.model.set_generation_args() diff --git a/utilization/metric/utils.py b/utilization/metric/metric_utils.py similarity index 100% rename from utilization/metric/utils.py rename to utilization/metric/metric_utils.py diff --git a/utilization/model/__init__.py b/utilization/model/__init__.py index 817ebfd4..3b4d86eb 100644 --- a/utilization/model/__init__.py +++ b/utilization/model/__init__.py @@ -1 +1 @@ -from .load import load_model +from .model import Model diff --git a/utilization/model/huggingface_model.py b/utilization/model/huggingface_model.py index 07c058c3..b9018d15 100644 --- a/utilization/model/huggingface_model.py +++ b/utilization/model/huggingface_model.py @@ -10,12 +10,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_utils_fast import PreTrainedTokenizerFast -from utilization.utils.conversation import Conversation -from utilization.utils.prefix_caching import SequenceCache - from ..utils import ModelArguments from .model import Model -from .model_utils import KeyWordsCriteria +from .model_utils.conversation import Conversation +from .model_utils.keywords_criteria import KeyWordsCriteria +from .model_utils.prefix_caching import SequenceCache logger = getLogger(__name__) @@ -82,15 +81,21 @@ def load_hf_model(args: ModelArguments) -> Tuple[PreTrainedModel, Union[PreTrain # https://github.com/meta-llama/llama/issues/380#issuecomment-1656714118 if args.torch_dtype == "auto": - with open(args.model_name_or_path + "/config.json") as f: - config = json.load(f) - if "torch_dtype" in config: - if config["torch_dtype"] == "float32": - args.torch_dtype = "float16" - else: - args.torch_dtype = config["torch_dtype"] + try: + with open(args.model_name_or_path + "/config.json") as f: + config = json.load(f) + if "torch_dtype" in config: + if config["torch_dtype"] == "float32": + torch_dtype = "float16" + else: + torch_dtype = config["torch_dtype"] + except: + torch_dtype = "float16" + else: + torch_dtype = args.torch_dtype + model_kwargs = dict( - 
torch_dtype=getattr(torch, args.torch_dtype), + torch_dtype=getattr(torch, torch_dtype), device_map=args.device_map, load_in_4bit=args.load_in_4bit, load_in_8bit=args.load_in_8bit, @@ -102,13 +107,15 @@ def load_hf_model(args: ModelArguments) -> Tuple[PreTrainedModel, Union[PreTrain if args.flash_attention: model_kwargs["attn_implementation"] = "flash_attention_2" + else: + model_kwargs["attn_implementation"] = "sdpa" if hasattr(args, 'bnb_config') and args.bnb_config: model_kwargs['quantization_config'] = args.bnb_config try: model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **model_kwargs).eval() - except (TypeError, ImportError, ValueError) as e: + except (TypeError, ImportError, ValueError, RuntimeError) as e: if "attn_implementation" in str(e) or "flash att" in str(e).lower().replace("_", " "): logger.warning( f"Cannot set `attn_implementation` for {args.model_name_or_path}: {e}. Set `flash_attention` to False." @@ -151,6 +158,8 @@ def __init__(self, args: ModelArguments): self.device = "cuda" if torch.cuda.is_available() else "cpu" self.model_max_input_and_output = self.tokenizer.model_max_length + # model tests + try: self.model(position_ids=None) except TypeError: @@ -167,6 +176,8 @@ def __init__(self, args: ModelArguments): except ValueError: self.support_cache = True + self.support_char_to_token = True + @property def model_max_input(self): return self.tokenizer.model_max_length diff --git a/utilization/model/model.py b/utilization/model/model.py index bfe9bc89..7ae3f0b8 100644 --- a/utilization/model/model.py +++ b/utilization/model/model.py @@ -6,11 +6,11 @@ from tiktoken import Encoding from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast +from ..model_enum import API_MODELS, ENDPOINT_ARGS, ERROR_OVERVIEW from ..utils import ModelArguments from ..utils.arguments import ModelBackendMixin -from ..utils.conversation import Conversation -from ..utils.prefix_caching import Cacher -from .model_enum import API_MODELS, ENDPOINT_ARGS, ERROR_OVERVIEW +from .model_utils.conversation import Conversation +from .model_utils.prefix_caching import Cacher if TYPE_CHECKING: # solve the circular import @@ -228,11 +228,11 @@ def set_generation_args(self, **extra_model_args): key = details.alias # type casting - if details._type is not None: + if details._type is not None and value is not None: value = details._type(value) # transform - if details.transform is not None: + if details.transform is not None and value is not None: value = details.transform(value) # skip if no value diff --git a/utilization/model/model_utils/__init__.py b/utilization/model/model_utils/__init__.py new file mode 100644 index 00000000..4171bc30 --- /dev/null +++ b/utilization/model/model_utils/__init__.py @@ -0,0 +1,4 @@ +from .batch_sampler import DatasetCollectionBatchSampler +from .conversation import Conversation, ConversationFormatter +from .keywords_criteria import KeyWordsCriteria +from .prefix_caching import SequenceCache diff --git a/utilization/utils/batch_sampler.py b/utilization/model/model_utils/batch_sampler.py similarity index 98% rename from utilization/utils/batch_sampler.py rename to utilization/model/model_utils/batch_sampler.py index 2d9a34f8..ef9ce331 100644 --- a/utilization/utils/batch_sampler.py +++ b/utilization/model/model_utils/batch_sampler.py @@ -7,7 +7,7 @@ from .prefix_caching import CachePrefixSampler, round_down if TYPE_CHECKING: - from ..dataset.dataset import Dataset, DatasetCollection + from ...dataset.dataset import Dataset, 
DatasetCollection logger = getLogger(__name__) @@ -165,7 +165,6 @@ def __iter__(self) -> Iterator[List[int]]: accumulative = 0 for total, init_model, self._forward_call in zip(*self._splitted): iterator, total_prefix_num = init_model() - print(f"Total prefix num: {total_prefix_num}!!!") if total_prefix_num > 1 and model.support_cache: sampler = CachePrefixSampler( data=iterator, diff --git a/utilization/utils/conversation.py b/utilization/model/model_utils/conversation.py similarity index 96% rename from utilization/utils/conversation.py rename to utilization/model/model_utils/conversation.py index 395dd86a..9e4ce91a 100644 --- a/utilization/utils/conversation.py +++ b/utilization/model/model_utils/conversation.py @@ -7,7 +7,7 @@ from jinja2.sandbox import ImmutableSandboxedEnvironment from transformers.pipelines.conversational import Conversation as _HFConversation -from ..chat_templates import DEFAULT_CHAT_CONFIGS, DEFAULT_CHAT_TEMPLATE +from ...chat_templates import DEFAULT_CHAT_CONFIGS, DEFAULT_CHAT_TEMPLATE, smart_space # legacy types NumOptions = NewType("NumOptions", int) @@ -36,12 +36,7 @@ def _compile_jinja_template(chat_template): def raise_exception(message): raise TemplateError(message) - def smart_space(s: str, auto_leading_space: bool, context: str) -> str: - if auto_leading_space and s and context and not context[-1].isspace(): - return ' ' + s - return s - - jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True) + jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=["jinja2.ext.do"]) jinja_env.globals["raise_exception"] = raise_exception jinja_env.filters['smart_space'] = smart_space return jinja_env.from_string(chat_template) @@ -127,8 +122,8 @@ def apply_prompt_template( *, add_generation_prompt: bool = False, remove_generation_prompt: bool = False, - final_lstrip: bool = False, - final_rstrip: bool = False, + final_lstrip: bool = True, + final_rstrip: bool = True, ) -> str: return self._apply_prompt_template( @@ -237,11 +232,11 @@ def from_conversations(cls, conversations: List["Conversation"]) -> "Conversatio messages = [msg for conv in conversations for msg in conv.messages] return Conversation(messages=messages) - def set_num_options(self, num_options: int): - self.num_options = num_options + def set_num_options(self, num_options: Optional[int]): + self.num_options = num_options if isinstance(num_options, int) else 0 - def set_num_shots(self, num_shots: int): - self.num_shots = num_shots + def set_num_shots(self, num_shots: Optional[int]): + self.num_shots = num_shots if isinstance(num_shots, int) else 0 def add_multi_turn(self, *, users: Optional[List[str]] = None, assistant: Optional[str] = None): if users is not None and assistant is not None: diff --git a/utilization/model/model_utils.py b/utilization/model/model_utils/keywords_criteria.py similarity index 100% rename from utilization/model/model_utils.py rename to utilization/model/model_utils/keywords_criteria.py diff --git a/utilization/utils/prefix_caching.py b/utilization/model/model_utils/prefix_caching.py similarity index 99% rename from utilization/utils/prefix_caching.py rename to utilization/model/model_utils/prefix_caching.py index 50a37394..51d157a0 100644 --- a/utilization/utils/prefix_caching.py +++ b/utilization/model/model_utils/prefix_caching.py @@ -5,7 +5,7 @@ from torch.utils.data.sampler import Sampler from transformers import DynamicCache -from utilization.utils.conversation import Conversation +from .conversation import 
Conversation _LegacyCache = Tuple[Tuple[torch.FloatTensor, torch.FloatTensor], ...] diff --git a/utilization/model/vllm_model.py b/utilization/model/vllm_model.py index 6899c62b..1f4d786e 100644 --- a/utilization/model/vllm_model.py +++ b/utilization/model/vllm_model.py @@ -5,8 +5,8 @@ import torch from packaging import version -from ..utils.conversation import Conversation from .model import Model +from .model_utils.conversation import Conversation if TYPE_CHECKING: from ..utils import ModelArguments diff --git a/utilization/model/model_enum.py b/utilization/model_enum.py similarity index 93% rename from utilization/model/model_enum.py rename to utilization/model_enum.py index 5d156031..9fa0324c 100644 --- a/utilization/model/model_enum.py +++ b/utilization/model_enum.py @@ -1,6 +1,33 @@ import re -from utilization.model.generation_args import generation_arg +from .utils.generation_args import generation_arg + +VLLM_ARGS = { + "temperature": generation_arg(), + "top_p": generation_arg(), + "top_k": generation_arg(), + "max_tokens": generation_arg(), + "best_of": generation_arg(), + "frequency_penalty": generation_arg(), + "presence_penalty": generation_arg(), + "repetition_penalty": generation_arg(), + "length_penalty": generation_arg(), + "early_stopping": generation_arg(), + "stop": generation_arg(), +} + +HUGGINGFACE_ARGS = { + "temperature": generation_arg(), + "top_p": generation_arg(), + "top_k": generation_arg(), + "max_tokens": generation_arg(), + "best_of": generation_arg(), + "repetition_penalty": generation_arg(), + "length_penalty": generation_arg(), + "early_stopping": generation_arg(), + "no_repeat_ngram_size": generation_arg(), + "stop": generation_arg(), +} ANTHROPIC_CHAT_COMPLETIONS_ARGS = { "max_tokens": generation_arg(default=4096), @@ -119,6 +146,11 @@ "model_type": "chat", "model_backend": "openai", }, + "gpt-4o": { + "endpoint": "chat/completions", + "model_type": "chat", + "model_backend": "openai", + }, "gpt-4-turbo": { "endpoint": "chat/completions", "model_type": "chat", diff --git a/utilization/utils/arguments.py b/utilization/utils/arguments.py index 9d8eb7c0..3ad1f2c5 100644 --- a/utilization/utils/arguments.py +++ b/utilization/utils/arguments.py @@ -10,14 +10,14 @@ from typing import Callable, ClassVar, Dict, List, Literal, Optional, Set, Tuple, Union import tiktoken -from transformers import BitsAndBytesConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.hf_argparser import HfArg, HfArgumentParser from ..chat_templates import DEFAULT_CHAT_CONFIGS -from ..dataset.dataset_enum import DEFAULT_VLLM_DATASETS -from ..model.model_enum import ( - ANTHROPIC_CHAT_COMPLETIONS_ARGS, API_MODELS, DASHSCOPE_CHAT_COMPLETIONS_ARGS, QIANFAN_CHAT_COMPLETIONS_ARGS +from ..dataset_enum import DEFAULT_VLLM_DATASETS +from ..model_enum import ( + ANTHROPIC_CHAT_COMPLETIONS_ARGS, API_MODELS, DASHSCOPE_CHAT_COMPLETIONS_ARGS, HUGGINGFACE_ARGS, + QIANFAN_CHAT_COMPLETIONS_ARGS, VLLM_ARGS ) +from .hf_argparser import HfArg, HfArgumentParser from .logging import filter_none_repr, get_redacted, list_datasets, log_levels, passed_in_commandline, set_logging logger = getLogger(__name__) @@ -26,8 +26,12 @@ if typing.TYPE_CHECKING: batch_size_type = int + from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast else: batch_size_type = str + PreTrainedModel = None + PreTrainedTokenizer = None + PreTrainedTokenizerFast = None LOADER = Callable[["ModelArguments"], Tuple["PreTrainedModel", Union["PreTrainedTokenizer", 
"PreTrainedTokenizerFast"]]] @@ -213,8 +217,14 @@ class ModelArguments(ModelBackendMixin): help="The maximum gpu memory utilization of vllm.", ) + cuda_visible_devices: str = HfArg( + default="", + aliases=["--cuda"], + help="The CUDA device to use, e.g., '0' or '0,1,3'", + ) + torch_dtype: Literal["float16", "bfloat16", "float32", "auto"] = HfArg( - default="float16", + default="auto", help="The torch dtype for model input and output", ) @@ -237,11 +247,12 @@ class ModelArguments(ModelBackendMixin): "dashscope": {"dashscope_api_key"} | set(DASHSCOPE_CHAT_COMPLETIONS_ARGS), "openai": set(), # openai model is used for gpt-eval metrics, not specific arguments "qianfan": {"qianfan_access_key", "qianfan_secret_key"} | set(QIANFAN_CHAT_COMPLETIONS_ARGS), - "vllm": {"vllm", "prefix_caching", "flash_attention", "gptq", "vllm_gpu_memory_utilization", "chat_template"}, + "vllm": {"vllm", "prefix_caching", "flash_attention", "gptq", "vllm_gpu_memory_utilization", "chat_template"} + | set(VLLM_ARGS), "huggingface": { "device_map", "vllm", "prefix_caching", "flash_attention", "bnb_config", "load_in_8bit", "load_in_4bit", - "gptq", "chat_template", "stop" - }, + "gptq", "chat_template" + } | set(HUGGINGFACE_ARGS), } def __post_init__(self): @@ -269,7 +280,12 @@ def __post_init__(self): "OpenAI API key is required. Please set it by passing a `--openai_api_key` or through environment variable `OPENAI_API_KEY`." ) if self.tokenizer_name_or_path is None: - self.tokenizer_name_or_path = tiktoken.encoding_name_for_model(self.model_name_or_path) + try: + self.tokenizer_name_or_path = tiktoken.encoding_name_for_model(self.model_name_or_path) + except KeyError as e: + raise RuntimeError( + "Unsupported tiktoken library version. Please update the tiktoken library to the latest version.\n\n pip install tiktoken --upgrade" + ) # set `self.anthropic_api_key` from environment variables if "ANTHROPIC_API_KEY" in os.environ and self.anthropic_api_key is None: @@ -469,10 +485,17 @@ class DatasetArguments: default=False, help="Whether to shuffle the choices for ranking task", ) + hf_mirror: bool = HfArg( + default=False, + help="Whether to use hfd.sh to load dataset from hugging face mirror server (experimental)", + ) + hfd_cache_path: str = HfArg( + default="~/.cache/huggingface/datasets", + help="The cache path for datasets downloaded with hfd.sh", + ) continue_from: ClassVar[int] = 0 proxy_port: ClassVar[int] = None - dataset_threading: ClassVar[bool] = True # set in `set_logging` with format "{evaluation_results_dir}/{log_filename}.json" evaluation_results_path: ClassVar[Optional[str]] = None @@ -541,11 +564,6 @@ class EvaluationArguments: default=None, help="The port of the proxy", ) - cuda_visible_devices: Optional[str] = HfArg( - default=None, - aliases=["--cuda"], - help="Override the CUDA_VISIBLE_DEVICES environment variable", - ) dataset_threading: bool = HfArg(default=True, help="Load dataset with threading") dataloader_workers: int = HfArg(default=0, help="The number of workers for dataloader") continue_from: Optional[str] = HfArg( @@ -580,6 +598,16 @@ def check_args(model_args: ModelArguments, dataset_args: DatasetArguments, evalu dataset_args (DatasetArguments): The dataset configurations. evaluation_args (EvaluationArguments): The evaluation configurations. 
""" + if model_args.cuda_visible_devices: + os.environ["CUDA_VISIBLE_DEVICES"] = model_args.cuda_visible_devices + import torch + + if torch.cuda.device_count() != len(model_args.cuda_visible_devices.split(",")): + logger.warning( + f"CUDA initalized before setting CUDA_VISIBLE_DEVICES (most likely because of importing torch or transformers before parse_arguments). Ignoring --cuda flag." + ) + os.environ.pop("CUDA_VISIBLE_DEVICES") + # vllm still has some bugs in ranking task if model_args.is_local_model() and all(d not in DEFAULT_VLLM_DATASETS for d in dataset_args.dataset_names ) and not model_args.passed_in_commandline("vllm"): @@ -590,8 +618,6 @@ def check_args(model_args: ModelArguments, dataset_args: DatasetArguments, evalu if evaluation_args.proxy_port: dataset_args.proxy_port = evaluation_args.proxy_port - dataset_args.dataset_threading = evaluation_args.dataset_threading - model_args.seed = int(evaluation_args.seed) if dataset_args.batch_size == 1 and model_args.prefix_caching: @@ -614,14 +640,6 @@ def check_args(model_args: ModelArguments, dataset_args: DatasetArguments, evalu f"Prefix caching might results in cuda memory fragmentation, which can be mitigated by setting `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`. See https://pytorch.org/docs/stable/notes/cuda.html#environment-variables for details." ) - if evaluation_args.cuda_visible_devices: - if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES" - ] != evaluation_args.cuda_visible_devices: - logger.warning( - f"Override CUDA_VISIBLE_DEVICES from {os.environ['CUDA_VISIBLE_DEVICES']} to {evaluation_args.cuda_visible_devices}." - ) - os.environ["CUDA_VISIBLE_DEVICES"] = evaluation_args.cuda_visible_devices - # check dataset if "vicuna_bench" in dataset_args.dataset_names and model_args.openai_api_key is None: raise ValueError( @@ -700,6 +718,7 @@ def parse_argument(args: Optional[List[str]] = None, model_args, dataset_args, evaluation_args = parser.parse_args_into_dataclasses(args) if model_args.bnb_config: + from transformers import BitsAndBytesConfig bnb_config_dict = json.loads(model_args.bnb_config) model_args.bnb_config = BitsAndBytesConfig(**bnb_config_dict) diff --git a/utilization/utils/catch_error.py b/utilization/utils/catch_error.py index fd5367bf..997785d1 100644 --- a/utilization/utils/catch_error.py +++ b/utilization/utils/catch_error.py @@ -1,17 +1,31 @@ +import inspect from functools import wraps from traceback import format_exc from .logging import getFileLogger +UNSOPPORTED_LIBRARY = "Unsupported {lib} library version. Please update the {lib} library to the latest version.\n\n pip install {lib} --upgrade" + ERROR_OVERVIEW = { "probability tensor contains either `inf`, `nan` or element < 0": "probability tensor contains either `inf`, `nan` or element < 0.\nSee https://github.com/meta-llama/llama/issues/380 for more details.", + "'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte": + "Failed to fetch subset names from Hugging Face Hub. Please check your internet connection or try hf-mirror mode with `--hf_mirror` (experimental).", + "openai.types": + UNSOPPORTED_LIBRARY.format(lib="openai"), "trust_remote_code": - "Unsupported datasets library version. Please update the datasets library to the latest version.\n\n pip install datasets --upgrade", + UNSOPPORTED_LIBRARY.format(lib="datasets"), + "datasets.exceptions.DatasetGenerationError": + "There is some issue when loading dataset with threading. 
Please try to disable threading with `--no_dataset_threading`.", } def catch_error(continue_from: bool = False): + """Catch the error and log the error message to log file. If the error is known, raise a RuntimeError with a message. + + Args: + - continue_from (bool): Prompt the user to continue from the checkpoint if an error occurs. + """ def catch_error_decrator(func): """Catch the error and log the error message to log file.""" diff --git a/utilization/model/generation_args.py b/utilization/utils/generation_args.py similarity index 88% rename from utilization/model/generation_args.py rename to utilization/utils/generation_args.py index 6ff62dcc..ef2aa247 100644 --- a/utilization/model/generation_args.py +++ b/utilization/utils/generation_args.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Any, Callable, Optional, Type, TypeVar, Union +from typing import Any, Callable, Optional @dataclass diff --git a/utilization/utils/hf_argparser.py b/utilization/utils/hf_argparser.py new file mode 100644 index 00000000..f5671175 --- /dev/null +++ b/utilization/utils/hf_argparser.py @@ -0,0 +1,423 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import json +import os +import sys +import types +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, ArgumentTypeError +from copy import copy +from enum import Enum +from inspect import isclass +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Literal, NewType, Optional, Tuple, Union, get_type_hints + +import yaml + +DataClass = NewType("DataClass", Any) +DataClassType = NewType("DataClassType", Any) + + +# From https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse +def string_to_bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ArgumentTypeError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." + ) + + +def make_choice_type_function(choices: list) -> Callable[[str], Any]: + """ + Creates a mapping function from each choices string representation to the actual value. Used to support multiple + value types for a single argument. + + Args: + choices (list): List of choices. + + Returns: + Callable[[str], Any]: Mapping function from string representation to actual value for each choice. + """ + str_to_choice = {str(choice): choice for choice in choices} + return lambda arg: str_to_choice.get(arg, arg) + + +def HfArg( + *, + aliases: Union[str, List[str]] = None, + help: str = None, + default: Any = dataclasses.MISSING, + default_factory: Callable[[], Any] = dataclasses.MISSING, + metadata: dict = None, + **kwargs, +) -> dataclasses.Field: + """Argument helper enabling a concise syntax to create dataclass fields for parsing with `HfArgumentParser`. 
+ + Example comparing the use of `HfArg` and `dataclasses.field`: + ``` + @dataclass + class Args: + regular_arg: str = dataclasses.field(default="Huggingface", metadata={"aliases": ["--example", "-e"], "help": "This syntax could be better!"}) + hf_arg: str = HfArg(default="Huggingface", aliases=["--example", "-e"], help="What a nice syntax!") + ``` + + Args: + aliases (Union[str, List[str]], optional): + Single string or list of strings of aliases to pass on to argparse, e.g. `aliases=["--example", "-e"]`. + Defaults to None. + help (str, optional): Help string to pass on to argparse that can be displayed with --help. Defaults to None. + default (Any, optional): + Default value for the argument. If not default or default_factory is specified, the argument is required. + Defaults to dataclasses.MISSING. + default_factory (Callable[[], Any], optional): + The default_factory is a 0-argument function called to initialize a field's value. It is useful to provide + default values for mutable types, e.g. lists: `default_factory=list`. Mutually exclusive with `default=`. + Defaults to dataclasses.MISSING. + metadata (dict, optional): Further metadata to pass on to `dataclasses.field`. Defaults to None. + + Returns: + Field: A `dataclasses.Field` with the desired properties. + """ + if metadata is None: + # Important, don't use as default param in function signature because dict is mutable and shared across function calls + metadata = {} + if aliases is not None: + metadata["aliases"] = aliases + if help is not None: + metadata["help"] = help + + return dataclasses.field(metadata=metadata, default=default, default_factory=default_factory, **kwargs) + + +class HfArgumentParser(ArgumentParser): + """ + This subclass of `argparse.ArgumentParser` uses type hints on dataclasses to generate arguments. + + The class is designed to play well with the native argparse. In particular, you can add more (non-dataclass backed) + arguments to the parser after initialization and you'll get the output back after parsing as an additional + namespace. Optional: To create sub argument groups use the `_argument_group_name` attribute in the dataclass. + """ + + dataclass_types: Iterable[DataClassType] + + def __init__(self, dataclass_types: Union[DataClassType, Iterable[DataClassType]], **kwargs): + """ + Args: + dataclass_types: + Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args. + kwargs (`Dict[str, Any]`, *optional*): + Passed to `argparse.ArgumentParser()` in the regular way. + """ + # To make the default appear when using --help + if "formatter_class" not in kwargs: + kwargs["formatter_class"] = ArgumentDefaultsHelpFormatter + super().__init__(**kwargs) + if dataclasses.is_dataclass(dataclass_types): + dataclass_types = [dataclass_types] + self.dataclass_types = list(dataclass_types) + for dtype in self.dataclass_types: + self._add_dataclass_arguments(dtype) + + @staticmethod + def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): + field_name = f"--{field.name}" + kwargs = field.metadata.copy() + # field.metadata is not used at all by Data Classes, + # it is provided as a third-party extension mechanism. 
+ if isinstance(field.type, str): + raise RuntimeError( + "Unresolved type detected, which should have been done with the help of " + "`typing.get_type_hints` method by default" + ) + + aliases = kwargs.pop("aliases", []) + if isinstance(aliases, str): + aliases = [aliases] + + origin_type = getattr(field.type, "__origin__", field.type) + if origin_type is Union or (hasattr(types, "UnionType") and isinstance(origin_type, types.UnionType)): + if str not in field.type.__args__ and ( + len(field.type.__args__) != 2 or type(None) not in field.type.__args__ + ): + raise ValueError( + "Only `Union[X, NoneType]` (i.e., `Optional[X]`) is allowed for `Union` because" + " the argument parser only supports one type per argument." + f" Problem encountered in field '{field.name}'." + ) + if type(None) not in field.type.__args__: + # filter `str` in Union + field.type = field.type.__args__[0] if field.type.__args__[1] == str else field.type.__args__[1] + origin_type = getattr(field.type, "__origin__", field.type) + elif bool not in field.type.__args__: + # filter `NoneType` in Union (except for `Union[bool, NoneType]`) + field.type = ( + field.type.__args__[0] if isinstance(None, field.type.__args__[1]) else field.type.__args__[1] + ) + origin_type = getattr(field.type, "__origin__", field.type) + + # A variable to store kwargs for a boolean field, if needed + # so that we can init a `no_*` complement argument (see below) + bool_kwargs = {} + if origin_type is Literal or (isinstance(field.type, type) and issubclass(field.type, Enum)): + if origin_type is Literal: + kwargs["choices"] = field.type.__args__ + else: + kwargs["choices"] = [x.value for x in field.type] + + kwargs["type"] = make_choice_type_function(kwargs["choices"]) + + if field.default is not dataclasses.MISSING: + kwargs["default"] = field.default + else: + kwargs["required"] = True + elif field.type is bool or field.type == Optional[bool]: + # Copy the currect kwargs to use to instantiate a `no_*` complement argument below. + # We do not initialize it here because the `no_*` alternative must be instantiated after the real argument + bool_kwargs = copy(kwargs) + + # Hack because type=bool in argparse does not behave as we want. + kwargs["type"] = string_to_bool + if field.type is bool or (field.default is not None and field.default is not dataclasses.MISSING): + # Default value is False if we have no default when of type bool. + default = False if field.default is dataclasses.MISSING else field.default + # This is the value that will get picked if we don't include --field_name in any way + kwargs["default"] = default + # This tells argparse we accept 0 or 1 value after --field_name + kwargs["nargs"] = "?" + # This is the value that will get picked if we do --field_name (without value) + kwargs["const"] = True + elif isclass(origin_type) and issubclass(origin_type, list): + kwargs["type"] = field.type.__args__[0] + kwargs["nargs"] = "+" + if field.default_factory is not dataclasses.MISSING: + kwargs["default"] = field.default_factory() + elif field.default is dataclasses.MISSING: + kwargs["required"] = True + else: + kwargs["type"] = field.type + if field.default is not dataclasses.MISSING: + kwargs["default"] = field.default + elif field.default_factory is not dataclasses.MISSING: + kwargs["default"] = field.default_factory() + else: + kwargs["required"] = True + parser.add_argument(field_name, *aliases, **kwargs) + + # Add a complement `no_*` argument for a boolean field AFTER the initial field has already been added. 
+ # Order is important for arguments with the same destination! + # We use a copy of earlier kwargs because the original kwargs have changed a lot before reaching down + # here and we do not need those changes/additional keys. + if field.default is True and (field.type is bool or field.type == Optional[bool]): + bool_kwargs["default"] = False + parser.add_argument(f"--no_{field.name}", action="store_false", dest=field.name, **bool_kwargs) + + def _add_dataclass_arguments(self, dtype: DataClassType): + if hasattr(dtype, "_argument_group_name"): + parser = self.add_argument_group(dtype._argument_group_name) + else: + parser = self + + try: + type_hints: Dict[str, type] = get_type_hints(dtype) + except NameError: + raise RuntimeError( + f"Type resolution failed for {dtype}. Try declaring the class in global scope or " + "removing line of `from __future__ import annotations` which opts in Postponed " + "Evaluation of Annotations (PEP 563)" + ) + except TypeError as ex: + # Remove this block when we drop Python 3.9 support + if sys.version_info[:2] < (3, 10) and "unsupported operand type(s) for |" in str(ex): + python_version = ".".join(map(str, sys.version_info[:3])) + raise RuntimeError( + f"Type resolution failed for {dtype} on Python {python_version}. Try removing " + "line of `from __future__ import annotations` which opts in union types as " + "`X | Y` (PEP 604) via Postponed Evaluation of Annotations (PEP 563). To " + "support Python versions that lower than 3.10, you need to use " + "`typing.Union[X, Y]` instead of `X | Y` and `typing.Optional[X]` instead of " + "`X | None`." + ) from ex + raise + + for field in dataclasses.fields(dtype): + if not field.init: + continue + field.type = type_hints[field.name] + self._parse_dataclass_field(parser, field) + + def parse_args_into_dataclasses( + self, + args=None, + return_remaining_strings=False, + look_for_args_file=True, + args_filename=None, + args_file_flag=None, + ) -> Tuple[DataClass, ...]: + """ + Parse command-line args into instances of the specified dataclass types. + + This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: + docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args + + Args: + args: + List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) + return_remaining_strings: + If true, also return a list of remaining argument strings. + look_for_args_file: + If true, will look for a ".args" file with the same base name as the entry point script for this + process, and will append its potential content to the command line args. + args_filename: + If not None, will uses this file instead of the ".args" file specified in the previous argument. + args_file_flag: + If not None, will look for a file in the command-line args specified with this flag. The flag can be + specified multiple times and precedence is determined by the order (last one wins). + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer.abspath + - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser + after initialization. + - The potential list of remaining argument strings. 
(same as argparse.ArgumentParser.parse_known_args) + """ + + if args_file_flag or args_filename or (look_for_args_file and len(sys.argv)): + args_files = [] + + if args_filename: + args_files.append(Path(args_filename)) + elif look_for_args_file and len(sys.argv): + args_files.append(Path(sys.argv[0]).with_suffix(".args")) + + # args files specified via command line flag should overwrite default args files so we add them last + if args_file_flag: + # Create special parser just to extract the args_file_flag values + args_file_parser = ArgumentParser() + args_file_parser.add_argument(args_file_flag, type=str, action="append") + + # Use only remaining args for further parsing (remove the args_file_flag) + cfg, args = args_file_parser.parse_known_args(args=args) + cmd_args_file_paths = vars(cfg).get(args_file_flag.lstrip("-"), None) + + if cmd_args_file_paths: + args_files.extend([Path(p) for p in cmd_args_file_paths]) + + file_args = [] + for args_file in args_files: + if args_file.exists(): + file_args += args_file.read_text().split() + + # in case of duplicate arguments the last one has precedence + # args specified via the command line should overwrite args from files, so we add them last + args = file_args + args if args is not None else file_args + sys.argv[1:] + namespace, remaining_args = self.parse_known_args(args=args) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in vars(namespace).items() if k in keys} + for k in keys: + delattr(namespace, k) + obj = dtype(**inputs) + outputs.append(obj) + if len(namespace.__dict__) > 0: + # additional namespace. + outputs.append(namespace) + if return_remaining_strings: + return (*outputs, remaining_args) + else: + if remaining_args: + raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {remaining_args}") + + return (*outputs,) + + def parse_dict(self, args: Dict[str, Any], allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: + """ + Alternative helper method that does not use `argparse` at all, instead uses a dict and populating the dataclass + types. + + Args: + args (`dict`): + dict containing config values + allow_extra_keys (`bool`, *optional*, defaults to `False`): + Defaults to False. If False, will raise an exception if the dict contains keys that are not parsed. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer. + """ + unused_keys = set(args.keys()) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in args.items() if k in keys} + unused_keys.difference_update(inputs.keys()) + obj = dtype(**inputs) + outputs.append(obj) + if not allow_extra_keys and unused_keys: + raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}") + return tuple(outputs) + + def parse_json_file(self, + json_file: Union[str, os.PathLike], + allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. + + Args: + json_file (`str` or `os.PathLike`): + File name of the json file to parse + allow_extra_keys (`bool`, *optional*, defaults to `False`): + Defaults to False. If False, will raise an exception if the json file contains keys that are not + parsed. 
+ + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer. + """ + with open(Path(json_file), encoding="utf-8") as open_json_file: + data = json.loads(open_json_file.read()) + outputs = self.parse_dict(data, allow_extra_keys=allow_extra_keys) + return tuple(outputs) + + def parse_yaml_file(self, + yaml_file: Union[str, os.PathLike], + allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: + """ + Alternative helper method that does not use `argparse` at all, instead loading a yaml file and populating the + dataclass types. + + Args: + yaml_file (`str` or `os.PathLike`): + File name of the yaml file to parse + allow_extra_keys (`bool`, *optional*, defaults to `False`): + Defaults to False. If False, will raise an exception if the json file contains keys that are not + parsed. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer. + """ + outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys) + return tuple(outputs) diff --git a/utilization/utils/hfd.py b/utilization/utils/hfd.py new file mode 100644 index 00000000..68b1eb8a --- /dev/null +++ b/utilization/utils/hfd.py @@ -0,0 +1,68 @@ +import os +from logging import getLogger +from pathlib import Path +from typing import Optional, Union + +logger = getLogger(__name__) + + +def get_script_path(repo_path: Union[Path, str]) -> Path: + if isinstance(repo_path, Path): + repo_path = str(repo_path) + + load_script_name = repo_path.split("/")[-1].split("--")[-1] + ".py" + return Path(repo_path) / load_script_name + + +def update_script(load_script_path: Path, mirror: bool, old: str, new: str) -> bool: + if not load_script_path.exists(): + return False + + # if HF_ENDPOINT is set in the environment, update the load script + hf_endpoint = os.environ.get("HF_ENDPOINT", "").rstrip("/") + new_endpoint = new.rstrip("/") + + if mirror or hf_endpoint == new_endpoint: + script = load_script_path.read_text() + updated_script = script.replace(old, new) + if script != updated_script: + load_script_path.with_suffix(".py.bak").write_text(script) + load_script_path.write_text(updated_script) + return True + + return False + + +def huggingface_download( + path: str, + hfd_cache_path: str, + mirror: bool = True, + old: str = "https://huggingface.co", + new: str = "https://hf-mirror.com", +) -> Optional[str]: + """Download a dataset from Hugging Face Hub to a local directory using hfd.sh.""" + + hub_cache_path = Path(hfd_cache_path).expanduser() + repo_name = "datasets--" + path.replace("/", "--") + repo_path = hub_cache_path / repo_name + load_script_path = get_script_path(repo_path) + + if repo_path.exists() and load_script_path.exists(): + update_script(load_script_path, mirror, old, new) + logger.debug(f"Found {repo_path}, skipping download") + return str(repo_path) + + if os.name != "posix": + logger.warning("hfd.sh is only supported on Unix-like systems.") + return None + + hfd_cli = Path(__file__).parent.parent / "hfd.sh" + logger.debug(f"Downloading {path} to {repo_path}") + + mirror_flag = " --mirror" if mirror else "" + command = f"bash {hfd_cli.as_posix()} {path} --dataset --local-dir {repo_path.as_posix()}" + os.system(command + mirror_flag) + + update_script(load_script_path, mirror, old, new) + + return str(repo_path) diff --git a/utilization/utils/log_results.py b/utilization/utils/log_results.py index 010fe325..481619c2 100644 --- a/utilization/utils/log_results.py 
+++ b/utilization/utils/log_results.py @@ -6,8 +6,6 @@ import pandas as pd -from .conversation import Conversation - logger = getLogger(__name__) if typing.TYPE_CHECKING: @@ -38,6 +36,8 @@ def wrapper(df: pd.DataFrame): def dump_conversations(convs: List[Any], local: bool): + from ..model.model_utils.conversation import Conversation + if isinstance(convs, (str, Conversation)): convs = [convs] diff --git a/utilization/utils/logging.py b/utilization/utils/logging.py index 3c98f222..b73839de 100644 --- a/utilization/utils/logging.py +++ b/utilization/utils/logging.py @@ -4,11 +4,12 @@ import pathlib import sys from dataclasses import fields +from functools import lru_cache from typing import TYPE_CHECKING, List, Optional, Set import coloredlogs -from ..dataset.dataset_enum import DATASET_ALIASES +from ..dataset_enum import DATASET_ALIASES if TYPE_CHECKING: # solve the circular import @@ -37,7 +38,10 @@ } +@lru_cache def list_datasets() -> List[str]: + """List all natively supported datasets.""" + results = os.listdir(os.path.join(os.path.dirname(__file__), "../dataset")) results = [f[:-3] for f in results if f.endswith(".py")] results = [f for f in results if f not in BUILTIN_DATASET] diff --git a/utilization/utils/random.py b/utilization/utils/random.py index 3dcb6381..95df2971 100644 --- a/utilization/utils/random.py +++ b/utilization/utils/random.py @@ -1,16 +1,6 @@ import random import numpy as np -import torch - -try: - import accelerate - from accelerate.utils.imports import is_mlu_available, is_npu_available, is_torch_xla_available, is_xpu_available - - if is_torch_xla_available(): - import torch_xla.core.xla_model as xm -except (ModuleNotFoundError, ImportError): - accelerate = None def set_seed(seed: int, device_specific: bool = False, deterministic: bool = False): @@ -25,6 +15,19 @@ def set_seed(seed: int, device_specific: bool = False, deterministic: bool = Fal deterministic (`bool`, *optional*, defaults to `False`): Whether to use deterministic algorithms where available. Can slow down training. """ + import torch + + try: + import accelerate + from accelerate.utils.imports import ( + is_mlu_available, is_npu_available, is_torch_xla_available, is_xpu_available + ) + + if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + except (ModuleNotFoundError, ImportError): + accelerate = None + if device_specific and accelerate is not None: seed += accelerate.state.AcceleratorState().process_index
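
Note on the vendored argument parser: `utilization/utils/hf_argparser.py` above is a local copy of Hugging Face's `HfArgumentParser`, so dataclass fields declared with `HfArg` keep working without importing `transformers` at argument-parsing time. The sketch below is not part of the patch; it only illustrates the parsing behaviour, and the dataclass and its field names are made up for the example (the real argument dataclasses live in `utilization/utils/arguments.py`).

```python
# Minimal sketch, assuming it is run from the repository root. The dataclass and its
# fields are illustrative only; they are not the ones defined in the patch.
from dataclasses import dataclass

from utilization.utils.hf_argparser import HfArg, HfArgumentParser


@dataclass
class ExampleArguments:
    model_name_or_path: str = HfArg(default="gpt-3.5-turbo", aliases=["-m"], help="Model to evaluate")
    batch_size: int = HfArg(default=1, help="Evaluation batch size")
    flash_attention: bool = HfArg(default=True, help="Use flash attention if available")


# Boolean fields that default to True automatically gain a `--no_<field>` complement flag,
# which is how options such as `--no_dataset_threading` are produced.
(example_args,) = HfArgumentParser(ExampleArguments).parse_args_into_dataclasses(
    ["-m", "gpt-4o", "--no_flash_attention"]
)
print(example_args.model_name_or_path, example_args.flash_attention)  # gpt-4o False
```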
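The experimental `--hf_mirror` path added in `utilization/utils/hfd.py` shells out to `hfd.sh` (expected next to the `utilization` modules) and rewrites the generated dataset loading script to point at `https://hf-mirror.com`. A minimal sketch of calling the helper directly is shown below; the dataset repository id is an illustrative assumption, not taken from the patch.

```python
# Minimal sketch of the hf-mirror flow. Assumes a POSIX shell is available;
# the helper returns None on non-POSIX systems.
from utilization.utils.hfd import huggingface_download

repo_path = huggingface_download(
    path="cais/mmlu",                                # illustrative Hub dataset id
    hfd_cache_path="~/.cache/huggingface/datasets",  # default of --hfd_cache_path
    mirror=True,                                     # what --hf_mirror toggles
)
if repo_path is not None:
    print(f"dataset cached under {repo_path}")
```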