Commit

change model to Qwen2
l-bat committed Dec 18, 2024
1 parent 8831223 commit eefe4f2
Showing 2 changed files with 44 additions and 63 deletions.
tests/python_tests/test_cache_optimizations.py (40 additions, 43 deletions)
@@ -69,7 +69,7 @@ class CacheOptTestStruct:


 SHORT_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=32, max_cache_size=96, aggregation_mode=AggregationMode.NORM_SUM)
-LONGBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=64, max_cache_size=256, aggregation_mode=AggregationMode.NORM_SUM)
+LONGBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=128, max_cache_size=672, aggregation_mode=AggregationMode.NORM_SUM)

 @pytest.mark.precommit
 @pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="doesn't work on win due to optimum-intel export bug, segfault on mac")
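
Editor's note: a minimal sketch (not part of the commit) of the arithmetic behind the enlarged LONGBENCH eviction budget, assuming the usual "keep the start and recent windows, evict from the middle" interpretation that the CacheEvictionConfig parameter names suggest; evictable_budget is a hypothetical helper, not an openvino_genai API.

def evictable_budget(start_size: int, recent_size: int, max_cache_size: int) -> int:
    # Tokens left for the evictable middle region once the pinned start and
    # recent windows are accounted for (assumed interpretation of the fields).
    assert max_cache_size > start_size + recent_size
    return max_cache_size - start_size - recent_size

print(evictable_budget(32, 64, 256))    # old LONGBENCH config: 160
print(evictable_budget(32, 128, 672))   # new LONGBENCH config: 512
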
@@ -148,84 +148,81 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t


 @pytest.fixture(scope='module')
-def phi3_converted_model(tmp_path_factory):
-    model_id = "meta-llama/Llama-3.2-3B-Instruct"
+def qwen2_converted_model(tmp_path_factory):
+    model_id = "Qwen/Qwen2-0.5B-Instruct"
     model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
     ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True)
     serialize(ov_tokenizer, models_path / "openvino_tokenizer.xml")
     serialize(ov_detokenizer, models_path / "openvino_detokenizer.xml")
-    phi3_converted_model = ConvertedModel(model, tokenizer, models_path)
-    yield phi3_converted_model
-    del phi3_converted_model
+    qwen2_converted_model = ConvertedModel(model, tokenizer, models_path)
+    yield qwen2_converted_model
+    del qwen2_converted_model
     del model


 @pytest.mark.precommit
-@pytest.mark.parametrize("subset", ["samsum", "qmsum", "trec", "qasper", "hotpotqa", "repobench-p"])
-def test_unoptimized_generation_longbench(phi3_converted_model, subset):
+@dataclass
+class LongBenchTestData:
+    subset: str
+    ref_score: float
+    max_cache_usage: float
+    avg_cache_usage: float
+
+
+@pytest.mark.parametrize("test_struct", [
+    LongBenchTestData("samsum", 34.96, 16.2, 8.145),
+    LongBenchTestData("trec", 35, 14, 7.284),
+    LongBenchTestData("qasper", 14.67, 22.8, 13.182),
+])
+def test_unoptimized_generation_longbench(qwen2_converted_model, test_struct):
     seqs_per_request = 32
     num_kv_blocks = 1000
     scheduler_config = get_scheduler_config(num_kv_blocks)
-    models_path = phi3_converted_model.models_path
+    models_path = qwen2_converted_model.models_path
     model_name = "/".join(models_path.parts[-2:])
+    subset = test_struct.subset
     max_new_tokens = dataset2maxlen[subset]
-    tokenizer = phi3_converted_model.tokenizer
+    tokenizer = qwen2_converted_model.tokenizer

     generation_config = GenerationConfig() # expecting default greedy sampling
     generation_config.num_return_sequences = 1
     generation_config.max_new_tokens = max_new_tokens
     generation_config.eos_token_id = tokenizer.eos_token_id

-    data = datasets.load_dataset('THUDM/LongBench', subset, split='test')
     scheduler_config.use_cache_eviction = True
     if scheduler_config.use_cache_eviction:
         scheduler_config.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG

-    # model_id = "microsoft/Phi-3-mini-4k-instruct"
-    # model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False)
+    model_cb_opt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {})
+    data = datasets.load_dataset('THUDM/LongBench', subset, split='test')

-    model_cb_noopt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {})
     with tqdm(total=len(data)) as progress_bar:
         batch = []
         answers = []
         for p_idx, data_sample in enumerate(data):
-            prompt, context_len = preprocess_prompt(tokenizer, data_sample, subset, model_name)
+            prompt = preprocess_prompt(data_sample, subset, model_name)
             progress_bar.update(1)
             batch.append(prompt)
-            answers.append({"context_len": context_len, "answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
-
-            # input = tokenizer(prompt, truncation=False, return_tensors="pt")
-            # output = model.generate(
-            #     **input,
-            #     max_new_tokens=128,
-            #     num_beams=1,
-            #     do_sample=False,
-            #     temperature=1.0,
-            #     min_length=context_len+1,
-            #     pad_token_id=tokenizer.eos_token_id,
-            #     eos_token_id=[tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
-            # )[0]
-            # pred = tokenizer.decode(output[context_len:], skip_special_tokens=True)
-            # pred = post_process_pred(output.m_generation_ids, subset, model_name)
-            # answers[-1]["pred"] = pred
-
-            if (
-                len(batch) == seqs_per_request
-                or p_idx == len(data) - 1
-            ):
-                ans_batch = model_cb_noopt.generate(
+            answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
+
+            if len(batch) == seqs_per_request or p_idx == len(data) - 1:
+                ans_batch = model_cb_opt.generate(
                     batch, [generation_config] * len(batch)
                 )
                 for i, output in enumerate(ans_batch, start=p_idx-len(batch)+1):
-                    context_len = answers[i]["context_len"]
                     pred = post_process_pred(output.m_generation_ids[0], subset, model_name)
                     answers[i]["pred"] = pred

                 batch.clear()

     score = evaluate(answers, subset)
     print(f"Score: {score}")

-    pipeline_noopt_metrics = model_cb_noopt.get_metrics()
-    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
-    del model_cb_noopt
+    pipeline_noopt_metrics = model_cb_opt.get_metrics()
+    print(f"Opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+
+    assert abs(test_struct.ref_score - score) < 1
+    assert abs(test_struct.max_cache_usage - pipeline_noopt_metrics.max_cache_usage) < 1
+    assert abs(test_struct.avg_cache_usage - pipeline_noopt_metrics.avg_cache_usage) < 1
+    del model_cb_opt
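
Editor's note: a small self-contained sketch (not part of the commit) of the index arithmetic the new flush loop relies on: prompts are batched, the batch is flushed every seqs_per_request prompts (or on the last prompt), and enumerate(..., start=p_idx - len(batch) + 1) maps each batched output back to its entry in answers. fake_generate is a hypothetical stand-in for the real pipeline call.

seqs_per_request = 2
data = ["p0", "p1", "p2", "p3", "p4"]           # five prompts; the last flush is partial
answers = [{} for _ in data]

def fake_generate(batch):
    # Stand-in for model_cb_opt.generate: one output per prompt, in order.
    return [f"pred for {p}" for p in batch]

batch = []
for p_idx, prompt in enumerate(data):
    batch.append(prompt)
    if len(batch) == seqs_per_request or p_idx == len(data) - 1:
        # p_idx is the global index of the last prompt in this flush, so the
        # first prompt of the flush sits at p_idx - len(batch) + 1.
        for i, output in enumerate(fake_generate(batch), start=p_idx - len(batch) + 1):
            answers[i]["pred"] = output
        batch.clear()

assert answers[0]["pred"] == "pred for p0"
assert answers[4]["pred"] == "pred for p4"
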
tests/python_tests/utils_longbench.py (4 additions, 20 deletions)
@@ -144,16 +144,6 @@ def qa_f1_score(prediction, ground_truth, **kwargs):
     "repobench-p": code_sim_score,
 }

-# Max length for NVIDIA GeForce RTX 3090 (24 GB)
-model2maxlen = {
-    "meta-llama/Llama-2-7b-chat-hf": 4096,
-    "meta-llama/Meta-Llama-3-8B-Instruct": 5000,
-    "meta-llama/Llama-3.1-8B-Instruct": 10000,
-    "microsoft/Phi-3-mini-4k-instruct": 4096,
-    'meta-llama/Llama-3.2-1B-Instruct': 10000,
-    'meta-llama/Llama-3.2-3B-Instruct': 10000,
-}
-
 dataset2maxlen = {
     "narrativeqa": 128,
     "qasper": 128,
@@ -235,20 +225,12 @@ def build_chat(prompt, model_name):
     return prompt


-def preprocess_prompt(tokenizer, data_sample, subset, model_name):
+def preprocess_prompt(data_sample, subset, model_name):
     prompt_format = dataset2prompt[subset]
-    max_length = model2maxlen[model_name]
-
     prompt = prompt_format.format(**data_sample)
-    tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
-    context_len = tokenized_prompt.shape[-1]
-    if len(tokenized_prompt) > max_length:
-        context_len = max_length
-        half = int(max_length/2)
-        prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
     if subset not in ["trec", "triviaqa", "samsum", "lsht", "lcc", "repobench-p"]:
         prompt = build_chat(prompt, model_name)
-    return prompt, context_len
+    return prompt


 def post_process_pred(pred, subset, model_name):
@@ -258,4 +240,6 @@ def post_process_pred(pred, subset, model_name):
         pred = pred[:pred.find("\nDialogue")]
     elif "Phi-3" in model_name and subset == "hotpotqa":
         pred = pred.lstrip('\n').split('\n')[0]
+    elif "Qwen" in model_name and subset == "qasper":
+        pred = pred.lstrip('\n').split('\n')[0]
     return pred
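
Editor's note: a tiny sketch (not part of the commit) of what the new Qwen/qasper branch in post_process_pred does: strip leading newlines and keep only the first line of the raw generation as the answer. The sample string below is made up for illustration.

raw = "\n\nYes, the method is evaluated on SQuAD.\nExplanation: the paper reports ..."
pred = raw.lstrip('\n').split('\n')[0]
assert pred == "Yes, the method is evaluated on SQuAD."
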
