Commit

change model to Qwen2
l-bat committed Dec 18, 2024
1 parent 8831223 commit eefe4f2
Showing 2 changed files with 44 additions and 63 deletions.
tests/python_tests/test_cache_optimizations.py (40 additions, 43 deletions)
@@ -69,7 +69,7 @@ class CacheOptTestStruct:


 SHORT_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=32, max_cache_size=96, aggregation_mode=AggregationMode.NORM_SUM)
-LONGBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=64, max_cache_size=256, aggregation_mode=AggregationMode.NORM_SUM)
+LONGBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=128, max_cache_size=672, aggregation_mode=AggregationMode.NORM_SUM)

 @pytest.mark.precommit
 @pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="doesn't work on win due to optimum-intel export bug, segfault on mac")
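
Editor's note: a minimal sketch (not part of the commit) of the arithmetic behind the enlarged LONGBENCH eviction budget, assuming the usual "keep the start and recent windows, evict from the middle" interpretation that the CacheEvictionConfig parameter names suggest; evictable_budget is a hypothetical helper, not an openvino_genai API.

def evictable_budget(start_size: int, recent_size: int, max_cache_size: int) -> int:
    # Tokens left for the evictable middle region once the pinned start and
    # recent windows are accounted for (assumed interpretation of the fields).
    assert max_cache_size > start_size + recent_size
    return max_cache_size - start_size - recent_size

print(evictable_budget(32, 64, 256))    # old LONGBENCH config: 160
print(evictable_budget(32, 128, 672))   # new LONGBENCH config: 512
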
@@ -148,84 +148,81 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t


 @pytest.fixture(scope='module')
-def phi3_converted_model(tmp_path_factory):
-    model_id = "meta-llama/Llama-3.2-3B-Instruct"
+def qwen2_converted_model(tmp_path_factory):
+    model_id = "Qwen/Qwen2-0.5B-Instruct"
     model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
     ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True)
     serialize(ov_tokenizer, models_path / "openvino_tokenizer.xml")
     serialize(ov_detokenizer, models_path / "openvino_detokenizer.xml")
-    phi3_converted_model = ConvertedModel(model, tokenizer, models_path)
-    yield phi3_converted_model
-    del phi3_converted_model
+    qwen2_converted_model = ConvertedModel(model, tokenizer, models_path)
+    yield qwen2_converted_model
+    del qwen2_converted_model
     del model


 @pytest.mark.precommit
-@pytest.mark.parametrize("subset", ["samsum", "qmsum", "trec", "qasper", "hotpotqa", "repobench-p"])
-def test_unoptimized_generation_longbench(phi3_converted_model, subset):
+@dataclass
+class LongBenchTestData:
+    subset: str
+    ref_score: float
+    max_cache_usage: float
+    avg_cache_usage: float
+
+
+@pytest.mark.parametrize("test_struct", [
+    LongBenchTestData("samsum", 34.96, 16.2, 8.145),
+    LongBenchTestData("trec", 35, 14, 7.284),
+    LongBenchTestData("qasper", 14.67, 22.8, 13.182),
+])
+def test_unoptimized_generation_longbench(qwen2_converted_model, test_struct):
     seqs_per_request = 32
     num_kv_blocks = 1000
     scheduler_config = get_scheduler_config(num_kv_blocks)
-    models_path = phi3_converted_model.models_path
+    models_path = qwen2_converted_model.models_path
     model_name = "/".join(models_path.parts[-2:])
+    subset = test_struct.subset
     max_new_tokens = dataset2maxlen[subset]
-    tokenizer = phi3_converted_model.tokenizer
+    tokenizer = qwen2_converted_model.tokenizer

     generation_config = GenerationConfig() # expecting default greedy sampling
     generation_config.num_return_sequences = 1
     generation_config.max_new_tokens = max_new_tokens
     generation_config.eos_token_id = tokenizer.eos_token_id

-    data = datasets.load_dataset('THUDM/LongBench', subset, split='test')
     scheduler_config.use_cache_eviction = True
     if scheduler_config.use_cache_eviction:
         scheduler_config.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG

-    # model_id = "microsoft/Phi-3-mini-4k-instruct"
-    # model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False)
+    model_cb_opt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {})
+    data = datasets.load_dataset('THUDM/LongBench', subset, split='test')

-    model_cb_noopt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {})
     with tqdm(total=len(data)) as progress_bar:
         batch = []
         answers = []
         for p_idx, data_sample in enumerate(data):
-            prompt, context_len = preprocess_prompt(tokenizer, data_sample, subset, model_name)
+            prompt = preprocess_prompt(data_sample, subset, model_name)
             progress_bar.update(1)
             batch.append(prompt)
-            answers.append({"context_len": context_len, "answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
-
-            # input = tokenizer(prompt, truncation=False, return_tensors="pt")
-            # output = model.generate(
-            #     **input,
-            #     max_new_tokens=128,
-            #     num_beams=1,
-            #     do_sample=False,
-            #     temperature=1.0,
-            #     min_length=context_len+1,
-            #     pad_token_id=tokenizer.eos_token_id,
-            #     eos_token_id=[tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
-            # )[0]
-            # pred = tokenizer.decode(output[context_len:], skip_special_tokens=True)
-            # pred = post_process_pred(output.m_generation_ids, subset, model_name)
-            # answers[-1]["pred"] = pred
-
-            if (
-                len(batch) == seqs_per_request
-                or p_idx == len(data) - 1
-            ):
-                ans_batch = model_cb_noopt.generate(
+            answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
+
+            if len(batch) == seqs_per_request or p_idx == len(data) - 1:
+                ans_batch = model_cb_opt.generate(
                     batch, [generation_config] * len(batch)
                 )
                 for i, output in enumerate(ans_batch, start=p_idx-len(batch)+1):
-                    context_len = answers[i]["context_len"]
                     pred = post_process_pred(output.m_generation_ids[0], subset, model_name)
                     answers[i]["pred"] = pred

                 batch.clear()

     score = evaluate(answers, subset)
     print(f"Score: {score}")

-    pipeline_noopt_metrics = model_cb_noopt.get_metrics()
-    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
-    del model_cb_noopt
+    pipeline_noopt_metrics = model_cb_opt.get_metrics()
+    print(f"Opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+
+    assert abs(test_struct.ref_score - score) < 1
+    assert abs(test_struct.max_cache_usage - pipeline_noopt_metrics.max_cache_usage) < 1
+    assert abs(test_struct.avg_cache_usage - pipeline_noopt_metrics.avg_cache_usage) < 1
+    del model_cb_opt
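
Editor's note: a small self-contained sketch (not part of the commit) of the index arithmetic the new flush loop relies on: prompts are batched, the batch is flushed every seqs_per_request prompts (or on the last prompt), and enumerate(..., start=p_idx - len(batch) + 1) maps each batched output back to its entry in answers. fake_generate is a hypothetical stand-in for the real pipeline call.

seqs_per_request = 2
data = ["p0", "p1", "p2", "p3", "p4"]           # five prompts; the last flush is partial
answers = [{} for _ in data]

def fake_generate(batch):
    # Stand-in for model_cb_opt.generate: one output per prompt, in order.
    return [f"pred for {p}" for p in batch]

batch = []
for p_idx, prompt in enumerate(data):
    batch.append(prompt)
    if len(batch) == seqs_per_request or p_idx == len(data) - 1:
        # p_idx is the global index of the last prompt in this flush, so the
        # first prompt of the flush sits at p_idx - len(batch) + 1.
        for i, output in enumerate(fake_generate(batch), start=p_idx - len(batch) + 1):
            answers[i]["pred"] = output
        batch.clear()

assert answers[0]["pred"] == "pred for p0"
assert answers[4]["pred"] == "pred for p4"
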
tests/python_tests/utils_longbench.py (4 additions, 20 deletions)
@@ -144,16 +144,6 @@ def qa_f1_score(prediction, ground_truth, **kwargs):
     "repobench-p": code_sim_score,
 }

-# Max length for NVIDIA GeForce RTX 3090 (24 GB)
-model2maxlen = {
-    "meta-llama/Llama-2-7b-chat-hf": 4096,
-    "meta-llama/Meta-Llama-3-8B-Instruct": 5000,
-    "meta-llama/Llama-3.1-8B-Instruct": 10000,
-    "microsoft/Phi-3-mini-4k-instruct": 4096,
-    'meta-llama/Llama-3.2-1B-Instruct': 10000,
-    'meta-llama/Llama-3.2-3B-Instruct': 10000,
-}
-
 dataset2maxlen = {
     "narrativeqa": 128,
     "qasper": 128,
@@ -235,20 +225,12 @@ def build_chat(prompt, model_name):
     return prompt


-def preprocess_prompt(tokenizer, data_sample, subset, model_name):
+def preprocess_prompt(data_sample, subset, model_name):
     prompt_format = dataset2prompt[subset]
-    max_length = model2maxlen[model_name]
-
     prompt = prompt_format.format(**data_sample)
-    tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
-    context_len = tokenized_prompt.shape[-1]
-    if len(tokenized_prompt) > max_length:
-        context_len = max_length
-        half = int(max_length/2)
-        prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
     if subset not in ["trec", "triviaqa", "samsum", "lsht", "lcc", "repobench-p"]:
         prompt = build_chat(prompt, model_name)
-    return prompt, context_len
+    return prompt


 def post_process_pred(pred, subset, model_name):
@@ -258,4 +240,6 @@ def post_process_pred(pred, subset, model_name):
         pred = pred[:pred.find("\nDialogue")]
     elif "Phi-3" in model_name and subset == "hotpotqa":
         pred = pred.lstrip('\n').split('\n')[0]
+    elif "Qwen" in model_name and subset == "qasper":
+        pred = pred.lstrip('\n').split('\n')[0]
     return pred
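
Editor's note: a tiny sketch (not part of the commit) of what the new Qwen/qasper branch in post_process_pred does: strip leading newlines and keep only the first line of the raw generation as the answer. The sample string below is made up for illustration.

raw = "\n\nYes, the method is evaluated on SQuAD.\nExplanation: the paper reports ..."
pred = raw.lstrip('\n').split('\n')[0]
assert pred == "Yes, the method is evaluated on SQuAD."
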
