OSPP: Implementation of Domain-Specific Large Model Benchmarking Based on KubeEdge-Ianvs #144

Merged · 2 commits · Oct 30, 2024
1 change: 1 addition & 0 deletions core/common/constant.py
@@ -26,6 +26,7 @@ class DatasetFormat(Enum):
     CSV = "csv"
     TXT = "txt"
     JSON = "json"
+    JSONL = "jsonl"


 class ParadigmType(Enum):

@@ -84,5 +84,8 @@ def _inference(self, job, trained_model):
         inference_output_dir = os.path.join(self.workspace, "output/inference/")
         os.environ["RESULT_SAVED_URL"] = inference_output_dir
         job.load(trained_model)
-        infer_res = job.predict(inference_dataset.x)
+        if hasattr(inference_dataset, 'need_other_info'):
+            infer_res = job.predict(inference_dataset)
+        else:
+            infer_res = job.predict(inference_dataset.x)
         return infer_res
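
For context, here is a hypothetical sketch (class and field names invented here, not part of this PR) of a parsed dataset that would take the new branch above, so that `predict()` receives the whole object instead of only its `.x` field:

```python
# Hypothetical sketch: a parsed dataset that opts in to the new code path.
# The paradigm only checks that the attribute exists (via hasattr), so defining
# it at all is enough for predict() to receive the full dataset object.
class SubjectiveTestData:
    need_other_info = True

    def __init__(self, x, judge_prompts):
        self.x = x                          # the test queries
        self.judge_prompts = judge_prompts  # extra context a subjective evaluation needs
```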
77 changes: 69 additions & 8 deletions core/testenvmanager/dataset/dataset.py
@@ -16,10 +16,16 @@

 import os
 import tempfile

 import pandas as pd
-from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse

+# pylint: disable=no-name-in-module
+# pylint: disable=too-many-instance-attributes
+from sedna.datasources import (
+    CSVDataParse,
+    TxtDataParse,
+    JSONDataParse,
+    JsonlDataParse,
+    JSONMetaDataParse,
+)
 from core.common import utils
 from core.common.constant import DatasetFormat

@@ -38,12 +44,28 @@ class Dataset:
     def __init__(self, config):
         self.train_url: str = ""
         self.test_url: str = ""
+        self.train_index: str = ""
+        self.test_index: str = ""
+        self.train_data: str = ""
+        self.test_data: str = ""
+        self.train_data_info: str = ""
+        self.test_data_info: str = ""
         self.label: str = ""
         self._parse_config(config)

     def _check_fields(self):
-        self._check_dataset_url(self.train_url)
-        self._check_dataset_url(self.test_url)
+        if self.train_index:
+            self._check_dataset_url(self.train_index)
+        if self.test_index:
+            self._check_dataset_url(self.test_index)
+        if self.train_data:
+            self._check_dataset_url(self.train_data)
+        if self.test_data:
+            self._check_dataset_url(self.test_data)
+        if self.train_data_info:
+            self._check_dataset_url(self.train_data_info)
+        if self.test_data_info:
+            self._check_dataset_url(self.test_data_info)

     def _parse_config(self, config):
         for attr, value in config.items():
@@ -108,6 +130,20 @@ def _process_index_file(self, file_url):

         return None

+    def _process_data_file(self, file_url):
+        file_format = utils.get_file_format(file_url)
+        if file_format == DatasetFormat.JSONL.value:
+            return file_url
+
+        return None
+
+    def _process_data_info_file(self, file_url):
+        file_format = utils.get_file_format(file_url)
+        if file_format == DatasetFormat.JSON.value:
+            return file_url
+
+        return None
+
     def process_dataset(self):
         """
         process dataset:
@@ -116,9 +152,26 @@ def process_dataset(self):
         in the index file(e.g.: txt index file).

         """
+        if self.train_index:
+            self.train_url = self._process_index_file(self.train_index)
+        elif self.train_data:
+            self.train_url = self._process_data_file(self.train_data)
+        elif self.train_data_info:
+            self.train_url = self._process_data_info_file(self.train_data_info)
+            # raise NotImplementedError('to be done')
+        else:
+            raise NotImplementedError('not one of train_index/train_data/train_data_info')
+
+        if self.test_index:
+            self.test_url = self._process_index_file(self.test_index)
+        elif self.test_data:
+            self.test_url = self._process_data_file(self.test_data)
+        elif self.test_data_info:
+            self.test_url = self._process_data_info_file(self.test_data_info)
+            # raise NotImplementedError('to be done')
+        else:
+            raise NotImplementedError('not one of test_index/test_data/test_data_info')

-        self.train_url = self._process_index_file(self.train_url)
-        self.test_url = self._process_index_file(self.test_url)

     # pylint: disable=too-many-arguments
     def split_dataset(
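
To show how the new fields are intended to be used together, here is a minimal sketch (field names come from the `__init__` above, the import path mirrors this file's path, and the dataset paths are placeholders to be replaced with real files):

```python
# Sketch only: drive the new train_data/test_data (jsonl) and *_data_info
# (metadata.json) branches of process_dataset() instead of the index-file branch.
from core.testenvmanager.dataset.dataset import Dataset

config = {
    # per split, at least one of *_index / *_data / *_data_info must be set;
    # otherwise process_dataset() raises NotImplementedError
    "train_data": "/path/to/government/objective/train_data/data.jsonl",
    "test_data": "/path/to/government/objective/test_data/data.jsonl",
    # or, for data described by a metadata file:
    # "test_data_info": "/path/to/government/subjective/test_data/metadata.json",
}

dataset = Dataset(config)
dataset.process_dataset()  # fills train_url / test_url from whichever field is set
```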
@@ -514,6 +567,11 @@ def load_data(
             e.g.: TxtDataParse, CSVDataParse.

         """
+        if file.split('/')[-1] == "metadata.json":
+            data = JSONMetaDataParse(data_type=data_type, func=feature_process)
+            data.parse(file)
+            return data
+
         data_format = utils.get_file_format(file)

         data = None
@@ -523,11 +581,14 @@

         if data_format == DatasetFormat.TXT.value:
             data = TxtDataParse(data_type=data_type, func=feature_process)
             # print(file)
             data.parse(file, use_raw=use_raw)

         if data_format == DatasetFormat.JSON.value:
             data = JSONDataParse(data_type=data_type, func=feature_process)
             data.parse(file)

+        if data_format == DatasetFormat.JSONL.value:
+            data = JsonlDataParse(data_type=data_type, func=feature_process)
+            data.parse(file)
+
         return data
104 changes: 104 additions & 0 deletions examples/government/singletask_learning_bench/README.md
@@ -0,0 +1,104 @@
# Government Benchmark

## Introduction

This is the work for the Domain-specific Large Model Benchmark: it constructs a benchmark suite for the government sector, including test datasets, evaluation metrics, testing environments, and usage guidelines.

The benchmark consists of two parts: subjective evaluation data and objective evaluation data.

## Design

### Metadata Format

| Name | Field Name | Option | Description |
| --- | --- | --- | --- |
| Data Name | dataset | Required | Name of the dataset |
| Data Description | description | Optional | Dataset description, such as usage scope, sample size, etc. |
| First-level Dimension | level_1_dim | Required | Fill in "Single Modal" or "Multi-Modal" |
| Second-level Dimension | level_2_dim | Required | For "Single Modal", fill in "Text", "Image", or "Audio". For "Multi-Modal", fill in "Text-Image", "Text-Audio", "Image-Audio", or "Text-Image-Audio" |
| Third-level Dimension | level_3_dim | Optional | Fill in only if all samples in the dataset share the same third-level dimension. If filled, the content should follow the standards in the normative reference document |
| Fourth-level Dimension | level_4_dim | Optional | Fill in only if all samples in the dataset share the same fourth-level dimension. If filled, the content should follow the standards in the normative reference document |

Metadata example:

```json
{
"dataset": "Medical BenchMark",
"description": "xxx",
"level_1_dim": "single-modal",
"level_2_dim": "text",
"level_3_dim": "Q&A",
"level_4_dim": "medical"
}
```

### Data Format

|Name|Option|Description|
|---|---|---|
|prompt|Optional|the background prompt for the LLM test|
|query|Required|the test question|
|response|Required|the answer to the question|
|explanation|Optional|the explanation of the answer|
|judge_prompt|Optional|the prompt for the judge model|
|level_1_dim|Optional|single-modal or multi-modal|
|level_2_dim|Optional|single-modal: text, image, video; multi-modal: text-image, text-video, text-image-video|
|level_3_dim|Required|details|
|level_4_dim|Required|details|

Data example:

```json
{
"prompt": "Please think step by step and answer the question.",
"question": "Which one is the correct answer of xxx? A. xxx B. xxx C. xxx D. xxx",
"response": "C",
"explanation": "xxx",
"level_1_dim": "single-modal",
"level_2_dim": "text",
"level_3_dim": "knowledge Q&A",
"level_4_dim": "medical knowledge"
}
```


## Changes to Core Code

![](./imgs/structure.png)

## Prepare Datasets

You can download the dataset from [Kaggle](https://www.kaggle.com/datasets/kubeedgeianvs/the-government-affairs-dataset-govaff/data?select=government_benchmark).

```
dataset/government
├── objective
│ ├── test_data
│ │ ├── data.jsonl
│ │ └── metadata.json
│ └── train_data
└── subjective
├── test_data
│ ├── data_full.jsonl
│ ├── data.jsonl
│ └── metadata.json
└── train_data
```
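
Each `data.jsonl` file holds one JSON object per line, using the fields from the data format table above. A quick sanity-check sketch for one split (path taken from the layout above):

```python
# Sketch: load one split of the downloaded benchmark and inspect the first sample.
import json

with open("dataset/government/objective/test_data/data.jsonl", encoding="utf-8") as f:
    samples = [json.loads(line) for line in f if line.strip()]

print(len(samples), "samples; first sample keys:", list(samples[0].keys()))
```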

## Prepare Environment

You should modify your sedna package as shown in this commit: [my sedna repo commit](https://github.com/IcyFeather233/sedna/commit/e13b82363c03dc771fca4922a24798554ca32a9f)

Alternatively, you can replace the files under `yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages/sedna` with the contents of `examples/resources/sedna-llm.zip`.

## Run Ianvs

### Objective

`ianvs -f examples/government/singletask_learning_bench/objective/benchmarkingjob.yaml`

### Subjective

`ianvs -f examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml`
@@ -0,0 +1,72 @@
benchmarkingjob:
# job name of benchmarking; string type;
name: "benchmarkingjob"
# the url address of the job workspace that will store the output of tests; string type;
workspace: "/home/icyfeather/project/ianvs/workspace"

# the url address of test environment configuration file; string type;
# the file format supports yaml/yml;
testenv: "./examples/government/singletask_learning_bench/objective/testenv/testenv.yaml"

# the configuration of test object
test_object:
# test type; string type;
# currently the only option is "algorithms"; others will be added later.
type: "algorithms"
# test algorithm configuration files; list type;
algorithms:
# algorithm name; string type;
- name: "politic_bench_singletask_learning"
# the url address of test algorithm configuration file; string type;
# the file format supports yaml/yml;
url: "./examples/government/singletask_learning_bench/objective/testalgorithms/gen/gen_algorithm.yaml"

# the configuration of ranking leaderboard
rank:
# rank the leaderboard by the metrics of each test case's evaluation and their order; list type;
# the sorting priority is based on the sequence of metrics in the list from front to back;
sort_by: [ { "acc": "descend" } ]

# visualization configuration
visualization:
# mode of visualization in the leaderboard; string type;
# There are quite a few possible dataitems in the leaderboard. Not all of them can be shown simultaneously on the screen.
# In the leaderboard, we provide the "selected_only" mode for the user to configure what is shown or is not shown.
mode: "selected_only"
# method of visualization for selected dataitems; string type;
# currently the options of value are as follows:
# 1> "print_table": print selected dataitems;
method: "print_table"

# selected dataitem configuration
# The user can add the dataitems of interest in terms of "paradigms", "modules", "hyperparameters" and "metrics",
# so that the selected columns will be shown.
selected_dataitem:
# currently the options of value are as follows:
# 1> "all": select all paradigms in the leaderboard;
# 2> paradigms in the leaderboard, e.g., "singletasklearning"
paradigms: [ "all" ]
# currently the options of value are as follows:
# 1> "all": select all modules in the leaderboard;
# 2> modules in the leaderboard, e.g., "basemodel"
modules: [ "all" ]
# currently the options of value are as follows:
# 1> "all": select all hyperparameters in the leaderboard;
# 2> hyperparameters in the leaderboard, e.g., "momentum"
hyperparameters: [ "all" ]
# currently the options of value are as follows:
# 1> "all": select all metrics in the leaderboard;
# 2> metrics in the leaderboard, e.g., "f1_score"
metrics: [ "acc" ]

# mode of saving selected and all dataitems in the workspace; string type;
# currently the options of value are as follows:
# 1> "selected_and_all": save selected and all dataitems;
# 2> "selected_only": save selected dataitems;
save_mode: "selected_and_all"





