From 9b551006ed5a967e01fc3e45b3d500796ffcf855 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 17 Jan 2024 19:09:54 +0100 Subject: [PATCH 01/15] support for concurrency in llm models --- optimum/intel/openvino/modeling.py | 68 ++++++++++++-------- optimum/intel/openvino/modeling_base.py | 6 +- optimum/intel/openvino/modeling_decoder.py | 42 ++++++++----- tests/openvino/gen_batch.py | 72 ++++++++++++++++++++++ tests/openvino/test_modeling.py | 18 +++--- 5 files changed, 157 insertions(+), 49 deletions(-) create mode 100644 tests/openvino/gen_batch.py diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index f6d3061a7a..881dcd2b9e 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -130,6 +130,7 @@ def to(self, device: str): be in upper or lower case. To speed up first inference, call `.compile()` after `.to()`. """ self._device = device.upper() + self.compiled_model = None self.request = None return self @@ -197,8 +198,10 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return SequenceClassifierOutput(logits=logits) @@ -263,12 +266,14 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() start_logits = ( - torch.from_numpy(outputs["start_logits"]).to(self.device) if not np_inputs else outputs["start_logits"] + torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(outputs["end_logits"]).to(self.device) if not np_inputs else outputs["end_logits"] + torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -333,8 +338,10 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return TokenClassifierOutput(logits=logits) @@ -398,11 +405,13 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() last_hidden_state = ( - torch.from_numpy(outputs["last_hidden_state"]).to(self.device) + torch.from_numpy(infer_request.get_tensor("last_hidden_state").data).to(self.device) if not np_inputs - else outputs["last_hidden_state"] + else infer_request.get_tensor("last_hidden_state").data ) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ 
-468,8 +477,10 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return MaskedLMOutput(logits=logits) @@ -595,8 +606,10 @@ def forward( } # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return ImageClassifierOutput(logits=logits) @@ -660,8 +673,10 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return SequenceClassifierOutput(logits=logits) @@ -732,8 +747,10 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return CausalLMOutput(logits=logits) @@ -813,12 +830,13 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data embeddings = ( - torch.from_numpy(outputs["embeddings"]).to(self.device) if not np_inputs else outputs["embeddings"] + torch.from_numpy(infer_request.get_tensor("embeddings").data).to(self.device) if not np_inputs else infer_request.get_tensor("embeddings").data ) - return XVectorOutput(logits=logits, embeddings=embeddings) @@ -890,7 +908,9 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return TokenClassifierOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_base.py 
b/optimum/intel/openvino/modeling_base.py index 05dc3af9b5..deff686e38 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -89,6 +89,7 @@ def __init__( self.model = model self.request = None + self.compiled_model = None if enable_compilation: self.compile() @@ -343,7 +344,7 @@ def _to_load( ) def compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the model to {self._device} ...") ov_config = {**self.ov_config} if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): @@ -351,7 +352,7 @@ def compile(self): cache_dir = Path(self.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") - self.request = core.compile_model(self.model, self._device, ov_config) + self.compiled_model = core.compile_model(self.model, self._device, ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -403,6 +404,7 @@ def half(self): apply_moc_transformations(self.model, cf=False) compress_model_transformation(self.model) self.request = None + self.compiled_model = None return self def forward(self, *args, **kwargs): diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 8a2167eae4..b2fb2ed593 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -135,21 +135,13 @@ def __init__( self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] - self._original_model = self.model.clone() # keep original model for serialization - self._pkv_precision = Type.f32 - self.next_beam_idx = None - self.update_pkv_precision() - if self.is_dynamic: - self.model = self._reshape(self.model, -1, -1) is_stateful_supported = ensure_stateful_is_available(warn=False) - if self.use_cache and not self.stateful: logger.warn( "Provided model does not contain state. It may lead to sub-optimal performance." 
"Please reexport model with updated OpenVINO version >= 2023.3.0 calling the `from_pretrained` method with original model " "and `export=True` parameter" ) - if self.stateful: if stateful is None: stateful = is_stateful_supported @@ -176,7 +168,13 @@ def raise_error(model_prop, user_prop, name): if use_cache ^ self.use_cache: raise_error(self.use_cache, use_cache, "use_cache") - if enable_compilation: + def init_ov_model(self, compile=True): + self._pkv_precision = Type.f32 + self.update_pkv_precision(force_fp32=False) + if self.is_dynamic: + self.model = self._reshape(self.model, -1, -1) + self._original_model = self.model.clone() # keep original model for serialization + if compile: self.compile() def update_pkv_precision(self, force_fp32=False): @@ -282,9 +280,10 @@ def _from_transformers( config.is_decoder = True config.is_encoder_decoder = False config.save_pretrained(save_dir_path) - return cls._from_pretrained( + model_instance = cls._from_pretrained( model_id=save_dir_path, config=config, use_cache=use_cache, load_in_8bit=False, stateful=None, **kwargs ) + return model_instance def _reshape( self, @@ -322,14 +321,18 @@ def reshape(self, batch_size: int, sequence_length: int): return self def compile(self): - if self.request is None: + if self.compiled_model is None: super().compile() - self.request = self.request.create_infer_request() def _make_stateful(self): patch_stateful(self.config, self.model) self.stateful = True + def create_infer_request(self): + if self.compiled_model is None: + self.compile() + if self.request is None: + self.request = self.compiled_model.create_infer_request() @add_start_docstrings( """ @@ -359,6 +362,8 @@ def forward( **kwargs, ) -> CausalLMOutputWithPast: self.compile() + self.create_infer_request() + if self.use_cache and past_key_values is not None: input_ids = input_ids[:, -1:] @@ -556,8 +561,17 @@ def _from_pretrained( else: init_cls = cls - return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) - + model_instance = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) + model_instance.init_ov_model(compile=kwargs.get("compile", True)) + model_instance.request = None + return model_instance + + def clone(self): + model_instance = self.__class__(model=self.model, config=self.config, compile=False) + model_instance.compiled_model = self.compiled_model + model_instance._pkv_precision = self._pkv_precision + model_instance.request = None + return model_instance class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py new file mode 100644 index 0000000000..282fd09921 --- /dev/null +++ b/tests/openvino/gen_batch.py @@ -0,0 +1,72 @@ +from optimum.intel import OVModelForCausalLM +from transformers import AutoTokenizer, AutoConfig, set_seed +import threading +from datetime import datetime + +set_seed(10) +model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf-stateful/pytorch/dldt/compressed_weights/OV_FP16-INT8/" +#model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf/pytorch/dldt/compressed_weights/OV_FP16-INT8/" +tokenizer = AutoTokenizer.from_pretrained(model_path) +tokenizer.pad_token = "[PAD]" +tokenizer.padding_side = "left" + +prompt1 = [" The weather is "] +prompt2 = [" Openvino is a " , " What the the relativity theory "] +prompt3 = [" Are cats smarter that dogs ", " 
How big is an elephant ", " the water in the ocean is much hotter than before "] +prompts = [prompt1, prompt2, prompt3] +OV_CONFIG = {'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '','NUM_STREAMS': '1'} +model = OVModelForCausalLM.from_pretrained(model_path, config=AutoConfig.from_pretrained(model_path, trust_remote_code=True),ov_config=OV_CONFIG, compile=True) + +NUM_THREADS = 3 + +threads = [None]* NUM_THREADS +results = [None]* NUM_THREADS + +def print_response(t,p,r): + print("THREAD", t) + print("PROMPT:", p) + for answer in r: + print("Answer:") + print(tokenizer.decode(answer, skip_special_tokens=True)) + +def gen_thread(prompt, results, i): + inputs = tokenizer(prompt, return_tensors="pt", padding=True) + generate_kwargs = dict( + input_ids=inputs.input_ids, + max_new_tokens=500, + temperature=1.0, + do_sample=True, + top_p=1.0, + top_k=50, + num_beams=5, + repetition_penalty=1.1 + ) + start = datetime.now() + model_exec = model.clone() + end = datetime.now() + print("cloning model duration", (end - start).total_seconds()*1000000, "us") + outputs = model_exec.generate(**generate_kwargs) + num_tok = 0 + for i in range(len(prompt)): + num_tok += outputs[i].numel() - inputs.get("input_ids")[i].numel() + results[i] = outputs, num_tok + +start = datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i],results, i)) + threads[i].start() + +total_tok = 0 +for i in range(len(threads)): + threads[i].join() + total_tok += results[i][1] +end = datetime.now() + +for i in range(len(threads)): + print_response(i,prompts[i],results[i][0]) + +print("Generation time [s]", ((end-start).total_seconds()), "tokens:", total_tok) +print("Throughput:", total_tok * 60 / ((end-start).total_seconds()), "tokens/min" ) + + + diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index e51b50a5b2..2f9143101c 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -129,9 +129,9 @@ def test_load_from_hub_and_save_model(self): self.assertTrue(manual_openvino_cache_dir.is_dir()) self.assertGreaterEqual(len(list(manual_openvino_cache_dir.glob("*.blob"))), 1) if is_openvino_version("<", "2023.3"): - self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") + self.assertEqual(loaded_model.compiled_model.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") else: - self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT"), "THROUGHPUT") + self.assertEqual(loaded_model.compiled_model.get_property("PERFORMANCE_HINT"), "THROUGHPUT") with tempfile.TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) @@ -748,7 +748,7 @@ def test_pipeline(self, model_arch): @parameterized.expand(TIMM_MODELS) def test_compare_to_timm(self, model_id): ov_model = OVModelForImageClassification.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) - self.assertEqual(ov_model.request.get_property("INFERENCE_PRECISION_HINT").to_string(), "f32") + self.assertEqual(ov_model.compiled_model.get_property("INFERENCE_PRECISION_HINT").to_string(), "f32") self.assertIsInstance(ov_model.config, PretrainedConfig) timm_model = timm.create_model(model_id, pretrained=True) preprocessor = TimmImageProcessor.from_pretrained(model_id) @@ -886,20 +886,21 @@ def test_compare_with_and_without_past_key_values(self): text = "This is a sample input" tokens = tokenizer(text, return_tensors="pt") - model_with_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=True) + 
model_with_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) _ = model_with_pkv.generate(**tokens) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=False) + model_without_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) _ = model_without_pkv.generate(**tokens) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - + print(outputs_model_with_pkv) + print(outputs_model_without_pkv) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) @@ -1201,20 +1202,19 @@ def test_compare_with_and_without_past_key_values(self): question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") - model_with_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=True) + model_with_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) _ = model_with_pkv.generate(**inputs) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=False) + model_without_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) _ = model_without_pkv.generate(**inputs) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) From 9e4ab17d3b383c9f96fee5c85ee695096a85424f Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 18 Jan 2024 08:38:38 +0100 Subject: [PATCH 02/15] style fixes --- optimum/intel/openvino/modeling.py | 60 +++++++++++++++---- optimum/intel/openvino/modeling_decoder.py | 3 +- optimum/intel/openvino/quantization.py | 2 + tests/openvino/gen_batch.py | 70 +++++++++++++--------- tests/openvino/test_modeling.py | 18 ++++-- 5 files changed, 107 insertions(+), 46 deletions(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 881dcd2b9e..fd3751de01 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -201,7 +201,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return 
SequenceClassifierOutput(logits=logits) @@ -270,10 +274,14 @@ def forward( infer_request.start_async(inputs) infer_request.wait() start_logits = ( - torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("start_logits").data + torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("end_logits").data + torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -341,7 +349,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return TokenClassifierOutput(logits=logits) @@ -480,7 +492,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return MaskedLMOutput(logits=logits) @@ -609,7 +625,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return ImageClassifierOutput(logits=logits) @@ -676,7 +696,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return SequenceClassifierOutput(logits=logits) @@ -750,7 +774,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return CausalLMOutput(logits=logits) @@ -833,9 +861,15 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = 
torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) embeddings = ( - torch.from_numpy(infer_request.get_tensor("embeddings").data).to(self.device) if not np_inputs else infer_request.get_tensor("embeddings").data + torch.from_numpy(infer_request.get_tensor("embeddings").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("embeddings").data ) return XVectorOutput(logits=logits, embeddings=embeddings) @@ -911,6 +945,10 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return TokenClassifierOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b2fb2ed593..1bd57042c8 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -111,7 +111,6 @@ def __init__( "`dynamic_shapes` was set to `False` but static shapes are not supported for causal language model. Please set `dynamic_shapes=True`." ) - enable_compilation = kwargs.get("compile", True) kwargs["compile"] = False # avoid extra compilation in the base class super().__init__( @@ -334,6 +333,7 @@ def create_infer_request(self): if self.request is None: self.request = self.compiled_model.create_infer_request() + @add_start_docstrings( """ OpenVINO Model with a causal language modeling head on top (linear layer with weights tied to the input @@ -573,6 +573,7 @@ def clone(self): model_instance.request = None return model_instance + class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 3da486c45c..fd517b3c88 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -294,6 +294,8 @@ def _quantize_ovcausallm( # Prefeth past_key_values self.model.update_pkv_precision(True) self.model.compile() + self.model.create_infer_request() + subset_size = kwargs.get("subset_size", 300) data_cache = [] diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py index 282fd09921..5b92ad3b06 100644 --- a/tests/openvino/gen_batch.py +++ b/tests/openvino/gen_batch.py @@ -1,59 +1,74 @@ -from optimum.intel import OVModelForCausalLM -from transformers import AutoTokenizer, AutoConfig, set_seed import threading -from datetime import datetime +from datetime import datetime + +from transformers import AutoConfig, AutoTokenizer, set_seed + +from optimum.intel import OVModelForCausalLM + set_seed(10) model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf-stateful/pytorch/dldt/compressed_weights/OV_FP16-INT8/" -#model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf/pytorch/dldt/compressed_weights/OV_FP16-INT8/" +# model_path = 
"/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf/pytorch/dldt/compressed_weights/OV_FP16-INT8/" tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" prompt1 = [" The weather is "] -prompt2 = [" Openvino is a " , " What the the relativity theory "] -prompt3 = [" Are cats smarter that dogs ", " How big is an elephant ", " the water in the ocean is much hotter than before "] +prompt2 = [" Openvino is a ", " What the the relativity theory "] +prompt3 = [ + " Are cats smarter that dogs ", + " How big is an elephant ", + " the water in the ocean is much hotter than before ", +] prompts = [prompt1, prompt2, prompt3] -OV_CONFIG = {'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '','NUM_STREAMS': '1'} -model = OVModelForCausalLM.from_pretrained(model_path, config=AutoConfig.from_pretrained(model_path, trust_remote_code=True),ov_config=OV_CONFIG, compile=True) +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "1"} +model = OVModelForCausalLM.from_pretrained( + model_path, + config=AutoConfig.from_pretrained(model_path, trust_remote_code=True), + ov_config=OV_CONFIG, + compile=True, +) NUM_THREADS = 3 -threads = [None]* NUM_THREADS -results = [None]* NUM_THREADS +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + -def print_response(t,p,r): +def print_response(t, p, r): print("THREAD", t) print("PROMPT:", p) for answer in r: print("Answer:") print(tokenizer.decode(answer, skip_special_tokens=True)) + def gen_thread(prompt, results, i): inputs = tokenizer(prompt, return_tensors="pt", padding=True) - generate_kwargs = dict( - input_ids=inputs.input_ids, - max_new_tokens=500, - temperature=1.0, - do_sample=True, - top_p=1.0, - top_k=50, - num_beams=5, - repetition_penalty=1.1 - ) + generate_kwargs = { + "input_ids": inputs.input_ids, + "max_new_tokens": 200, + "temperature": 1.0, + "do_sample": True, + "top_p": 1.0, + "top_k": 50, + "num_beams" :5, + "repetition_penalty": 1.1 + } start = datetime.now() model_exec = model.clone() end = datetime.now() - print("cloning model duration", (end - start).total_seconds()*1000000, "us") + print("cloning model duration", (end - start).total_seconds() * 1000000, "us") outputs = model_exec.generate(**generate_kwargs) num_tok = 0 for i in range(len(prompt)): num_tok += outputs[i].numel() - inputs.get("input_ids")[i].numel() results[i] = outputs, num_tok + start = datetime.now() for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i],results, i)) + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) threads[i].start() total_tok = 0 @@ -63,10 +78,7 @@ def gen_thread(prompt, results, i): end = datetime.now() for i in range(len(threads)): - print_response(i,prompts[i],results[i][0]) - -print("Generation time [s]", ((end-start).total_seconds()), "tokens:", total_tok) -print("Throughput:", total_tok * 60 / ((end-start).total_seconds()), "tokens/min" ) - - + print_response(i, prompts[i], results[i][0]) +print("Generation time [s]", ((end - start).total_seconds()), "tokens:", total_tok) +print("Throughput:", total_tok * 60 / ((end - start).total_seconds()), "tokens/min") diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2f9143101c..c1746f5538 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -886,21 +886,25 @@ def test_compare_with_and_without_past_key_values(self): text = "This is a sample input" tokens = tokenizer(text, 
return_tensors="pt") - model_with_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) + model_with_pkv = OVModelForSeq2SeqLM.from_pretrained( + model_id, export=True, use_cache=True, ov_config=F32_CONFIG + ) _ = model_with_pkv.generate(**tokens) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) + model_without_pkv = OVModelForSeq2SeqLM.from_pretrained( + model_id, export=True, use_cache=False, ov_config=F32_CONFIG + ) _ = model_without_pkv.generate(**tokens) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) print(outputs_model_with_pkv) - print(outputs_model_without_pkv) + print(outputs_model_without_pkv) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) @@ -1202,14 +1206,18 @@ def test_compare_with_and_without_past_key_values(self): question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") - model_with_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) + model_with_pkv = OVModelForPix2Struct.from_pretrained( + model_id, export=True, use_cache=True, ov_config=F32_CONFIG + ) _ = model_with_pkv.generate(**inputs) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) + model_without_pkv = OVModelForPix2Struct.from_pretrained( + model_id, export=True, use_cache=False, ov_config=F32_CONFIG + ) _ = model_without_pkv.generate(**inputs) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( From dcb2a8fddc2833fdd9f82989ab5554aac787f012 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 19 Jan 2024 12:01:11 +0100 Subject: [PATCH 03/15] concurrency in seq2seq and stable diffusion classes --- optimum/intel/openvino/modeling_diffusion.py | 36 ++++++++---- optimum/intel/openvino/modeling_seq2seq.py | 31 +++++----- tests/openvino/gen_batch.py | 4 +- tests/openvino/gen_img.py | 59 ++++++++++++++++++++ tests/openvino/gen_seq2seq.py | 51 +++++++++++++++++ tests/openvino/test_stable_diffusion.py | 15 +++-- 6 files changed, 161 insertions(+), 35 deletions(-) create mode 100644 tests/openvino/gen_img.py create mode 100644 tests/openvino/gen_seq2seq.py diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index fa48a5df68..8454acc3d9 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -532,7 +532,7 @@ def __init__( for inputs in self.model.inputs } self.ov_config = ov_config or {**self.parent_model.ov_config} - self.request = None + self.compiled_model = None self._model_name = model_name self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / 
model_name / self.CONFIG_NAME @@ -541,13 +541,13 @@ def __init__( self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") def _compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the {self._model_name} to {self.device} ...") - self.request = core.compile_model(self.model, self.device, self.ov_config) + self.compiled_model = core.compile_model(self.model, self.device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self.device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(self.request) + _print_compiled_model_properties(self.compiled_model) @property def device(self): @@ -570,8 +570,11 @@ def __call__(self, input_ids: np.ndarray): inputs = { "input_ids": input_ids, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + return outputs class OVModelUnet(OVModelPart): @@ -604,8 +607,11 @@ def __call__( if timestep_cond is not None: inputs["timestep_cond"] = timestep_cond - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + return outputs class OVModelVaeDecoder(OVModelPart): @@ -620,8 +626,11 @@ def __call__(self, latent_sample: np.ndarray): inputs = { "latent_sample": latent_sample, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.results[output].data for output in infer_request.results] + return outputs def _compile(self): if "GPU" in self.device: @@ -641,8 +650,11 @@ def __call__(self, sample: np.ndarray): inputs = { "sample": sample, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + return outputs def _compile(self): if "GPU" in self.device: diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 9a7f913ab2..db05e0acc9 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -442,7 +442,7 @@ def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict, self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} self.main_input_name = main_input_name self.ov_config = ov_config - self.request = None + self.compiled_model = None @add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING) def forward( @@ -461,9 +461,10 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - last_hidden_state = torch.from_numpy( - self.request(inputs, share_inputs=True, 
share_outputs=True)["last_hidden_state"] - ).to(self.device) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + last_hidden_state = torch.from_numpy(infer_request.get_tensor("last_hidden_state").data).to(self.device) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -471,9 +472,9 @@ def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) def _compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the encoder to {self._device} ...") - self.request = core.compile_model(self.model, self._device, self.ov_config) + self.compiled_model = core.compile_model(self.model, self._device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -509,7 +510,7 @@ def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict): self.num_pkv = 4 self.ov_config = ov_config - self.request = None + self.compiled_model = None @add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING) def forward( @@ -546,13 +547,14 @@ def forward( if "decoder_attention_mask" in self.input_names and decoder_attention_mask is not None: inputs["decoder_attention_mask"] = decoder_attention_mask # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the # self-attention layer and 2 to the cross-attention layer) - out_past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + out_past_key_values = tuple(infer_request.get_tensor(key).data for key in self.key_value_output_names) # Tuple of tuple of length `n_layers`, with each tuple of length equal to: # * 4 for the decoder without cache (k/v of self-attention + k/v of cross-attention) @@ -574,14 +576,13 @@ def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) def _compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the decoder to {self._device} ...") - compiled_model = core.compile_model(self.model, self._device, self.ov_config) - self.request = compiled_model.create_infer_request() + self.compiled_model = core.compile_model(self.model, self._device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(compiled_model) + _print_compiled_model_properties(self.compiled_model) @add_start_docstrings( diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py index 5b92ad3b06..5d165217ec 100644 --- a/tests/openvino/gen_batch.py +++ b/tests/openvino/gen_batch.py @@ -52,8 +52,8 @@ def gen_thread(prompt, results, i): "do_sample": True, "top_p": 1.0, "top_k": 50, - 
"num_beams" :5, - "repetition_penalty": 1.1 + "num_beams": 5, + "repetition_penalty": 1.1, } start = datetime.now() model_exec = model.clone() diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py new file mode 100644 index 0000000000..72a6c97439 --- /dev/null +++ b/tests/openvino/gen_img.py @@ -0,0 +1,59 @@ +import datetime +import threading + +from diffusers import DDIMScheduler + +from optimum.intel.openvino import OVStableDiffusionPipeline + + +MODEL_PATH = "/home/devuser/model_server/demos/python_demos/stable_diffusion/model" +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} + + +pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="CPU", ov_config=OV_CONFIG) +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + + +# set_seed(10) + + +prompt1 = [" Zebras in space "] +prompt2 = [" The statue of liberty in New York", " Big Ben in London "] +prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def save_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("IMG:", i) + r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") + + +def gen_thread(prompt, results, i): + text = prompt + images = pipe(text).images + results[i] = images + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_img = 0 +for i in range(len(threads)): + threads[i].join() + nu_img += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + save_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "images:", nu_img) +print("Throughput:", nu_img * 60 / ((end - start).total_seconds()), "images/min") diff --git a/tests/openvino/gen_seq2seq.py b/tests/openvino/gen_seq2seq.py new file mode 100644 index 0000000000..27d3ed2a45 --- /dev/null +++ b/tests/openvino/gen_seq2seq.py @@ -0,0 +1,51 @@ +import datetime +import threading + +from transformers import AutoTokenizer, pipeline + +from optimum.intel import OVModelForSeq2SeqLM + + +model_id = "echarlaix/t5-small-openvino" +model = OVModelForSeq2SeqLM.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + +prompt1 = ["I live in Europe"] +prompt2 = ["What is your name?", "The dog is very happy"] +prompt3 = ["It's a beautiful weather today", "Yes", "Good morning"] +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("TRANSLATION", i, ":", r[i]["translation_text"]) + + +def gen_thread(prompt, results, i): + translations = pipe(prompt) + results[i] = translations + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_trans = 0 +for i in range(len(threads)): + threads[i].join() + nu_trans += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "translations:", nu_trans) +print("Throughput:", 
nu_trans / ((end - start).total_seconds()), "translations/s") diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index 1fce7c3fc8..2dc56fbe4b 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -55,6 +55,9 @@ from optimum.utils.import_utils import _diffusers_version +F32_CONFIG = {"CACHE_DIR": "", "INFERENCE_PRECISION_HINT": "f32"} + + def _generate_inputs(batch_size=1): inputs = { "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, @@ -208,7 +211,7 @@ class OVStableDiffusionPipelineTest(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_diffusers(self, model_arch: str): model_id = MODEL_NAMES[model_arch] - ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True) + ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) @@ -361,7 +364,7 @@ class OVtableDiffusionXLPipelineTest(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_diffusers(self, model_arch: str): - ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.text_encoder_2, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) @@ -407,7 +410,7 @@ def test_image_reproducibility(self, model_arch: str): # Verify every subcomponent is compiled by default for component in {"unet", "vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"}: - self.assertIsInstance(getattr(pipeline, component).request, CompiledModel) + self.assertIsInstance(getattr(pipeline, component).compiled_model, CompiledModel) batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 inputs = _generate_inputs(batch_size) @@ -447,11 +450,11 @@ class OVStableDiffusionXLImg2ImgPipelineTest(unittest.TestCase): def test_inference(self): model_id = "hf-internal-testing/tiny-stable-diffusion-xl-pipe" - pipeline = self.MODEL_CLASS.from_pretrained(model_id) + pipeline = self.MODEL_CLASS.from_pretrained(model_id, ov_config=F32_CONFIG) with tempfile.TemporaryDirectory() as tmp_dir: pipeline.save_pretrained(tmp_dir) - pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir) + pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir, ov_config=F32_CONFIG) batch_size, height, width = 1, 128, 128 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) @@ -488,7 +491,7 @@ class OVLatentConsistencyModelPipelineTest(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES) @unittest.skipIf(parse(_diffusers_version) <= Version("0.21.4"), "not supported with this diffusers version") def test_compare_to_diffusers(self, model_arch: str): - ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) From 
e189e4055a5c8bdd3676c1a9733324d1f6236b4a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 25 Jan 2024 10:23:42 +0100 Subject: [PATCH 04/15] concurrency via model cloning in encoders and decoders --- optimum/intel/openvino/modeling.py | 112 +++++++++++++++--------- optimum/intel/openvino/modeling_base.py | 16 +++- 2 files changed, 84 insertions(+), 44 deletions(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index e7f10fb7dd..b6242b0fa2 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -15,7 +15,7 @@ import os from pathlib import Path from typing import Optional, Union - +import time import numpy as np import openvino import torch @@ -180,6 +180,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -197,13 +198,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -252,6 +255,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -269,18 +273,21 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) + start_logits = ( - torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("start_logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("start_logits").data + else self.request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("end_logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("end_logits").data + else self.request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -328,6 +335,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -345,13 +353,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return TokenClassifierOutput(logits=logits) @@ -399,6 +409,7 @@ def forward( 
**kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -416,13 +427,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) last_hidden_state = ( - torch.from_numpy(infer_request.get_tensor("last_hidden_state").data).to(self.device) + torch.from_numpy(self.request.get_tensor("last_hidden_state").data).to(self.device) if not np_inputs - else infer_request.get_tensor("last_hidden_state").data + else self.request.get_tensor("last_hidden_state").data ) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -471,6 +484,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -488,13 +502,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return MaskedLMOutput(logits=logits) @@ -611,6 +627,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(pixel_values, np.ndarray) if not np_inputs: @@ -621,13 +638,15 @@ def forward( } # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return ImageClassifierOutput(logits=logits) @@ -677,6 +696,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_values, np.ndarray) if not np_inputs: @@ -692,13 +712,15 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -756,6 +778,8 @@ def forward( attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): + self.compile() + self.create_infer_request() np_inputs = isinstance(input_values, np.ndarray) if not np_inputs: input_values = np.array(input_values) @@ -770,13 +794,15 @@ def forward( 
inputs["attention_mask"] = attention_mask # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return CausalLMOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index deff686e38..e65bdddb0c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -89,6 +89,7 @@ def __init__( self.model = model self.request = None + self.async_exec = False self.compiled_model = None if enable_compilation: self.compile() @@ -356,7 +357,11 @@ def compile(self): # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(self.request) + _print_compiled_model_properties(self.compiled_model) + + def create_infer_request(self): + if self.request is None: + self.request = self.compiled_model.create_infer_request() def _reshape( self, @@ -394,6 +399,7 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid """ self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False self.model = self._reshape(self.model, batch_size, sequence_length, height, width) + self.compiled_model = None self.request = None return self @@ -409,6 +415,14 @@ def half(self): def forward(self, *args, **kwargs): raise NotImplementedError + + def clone(self): + self.compile() + model_cloned = self.__class__(self.model, config=self.config, compile=False) + model_cloned.compiled_model = self.compiled_model + model_cloned.async_exec = True + model_cloned._device = self._device + return model_cloned def can_generate(self) -> bool: """ From f75021bfeb59861578e7bab3174a711a79c2b5fb Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 25 Jan 2024 12:44:52 +0100 Subject: [PATCH 05/15] fix clone performance --- optimum/intel/openvino/modeling_base.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 2d9b6b5ea9..e6fa93f2cf 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -17,7 +17,6 @@ from pathlib import Path from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union - import openvino from huggingface_hub import hf_hub_download from openvino import Core, convert_model @@ -69,12 +68,10 @@ def __init__( self.ov_config = ov_config if ov_config is not None else {"PERFORMANCE_HINT": "LATENCY"} self.preprocessors = kwargs.get("preprocessors", []) enable_compilation = kwargs.get("compile", True) - if self.is_dynamic: height = -1 if self.export_feature == "image-classification" else None width = -1 if self.export_feature == "image-classification" else None model = self._reshape(model, -1, -1, height, width) - input_names = {} for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) @@ 
-86,14 +83,12 @@ def __init__( names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx self.output_names = output_names - self.model = model self.request = None self.async_exec = False self.compiled_model = None if enable_compilation: self.compile() - if is_transformers_version("<=", "4.25.1"): self.generation_config = None else: @@ -422,7 +417,7 @@ def forward(self, *args, **kwargs): def clone(self): self.compile() - model_cloned = self.__class__(self.model, config=self.config, compile=False) + model_cloned = self.__class__(self.model, config=self.config, compile=False, dynamic_shapes=False) model_cloned.compiled_model = self.compiled_model model_cloned.async_exec = True model_cloned._device = self._device From e9bb94116f7e2bd120f8534a3fbb9539ce0fd6df Mon Sep 17 00:00:00 2001 From: mzegla Date: Fri, 26 Jan 2024 15:15:28 +0100 Subject: [PATCH 06/15] init --- tests/openvino/test_modeling.py | 94 ++++++++++++++++++++++++++++++++- tests/openvino/utils_tests.py | 22 ++++++++ 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 74f033b95e..5fcc368252 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -26,6 +26,7 @@ from datasets import load_dataset from evaluate import evaluator from parameterized import parameterized +import threading from PIL import Image from transformers import ( AutoFeatureExtractor, @@ -50,7 +51,7 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor -from utils_tests import MODEL_NAMES +from utils_tests import MODEL_NAMES, OVThread from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( @@ -253,7 +254,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" + inputs = "This is sample input." tokens = tokenizer(inputs, return_tensors="pt") with torch.no_grad(): transformers_outputs = transformers_model(**tokens) @@ -268,6 +269,51 @@ def test_compare_to_transformers(self, model_arch): del ov_model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForSequenceClassification.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs_list = ["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", + "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen", + "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", + "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. 
Might be the worst thing I've seen",] + tokens_list = [] + for inputs in inputs_list: + tokens_list.append(tokenizer(inputs, return_tensors="pt")) + + transformers_outputs_list = [] + for tokens in tokens_list: + with torch.no_grad(): + transformers_outputs_list.append(transformers_model(**tokens)) + + def run_ov_model(inputs, transformers_outputs): + for input_type in ["pt", "np"]: + tokens = tokenizer(inputs, return_tensors=input_type) + ov_outputs = ov_model(**tokens) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + # Compare tensor outputs + close_enough = torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4) + #print(f"[{threading.get_ident()}] OV model logits: {ov_outputs.logits} Torch model logits: {transformers_outputs.logits} Close enough: {close_enough}") + self.assertTrue(close_enough) + + NUM_THREADS=4 + threads = [] + for i in range(NUM_THREADS): + threads.append(OVThread(target=run_ov_model, args=[inputs_list[i], transformers_outputs_list[i]])) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -524,6 +570,50 @@ def test_compare_to_transformers(self, model_arch): del ov_model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + self.assertTrue(ov_model.use_cache) + self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode") + transformers_model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs_list = ["This is a sample", "The sky is blue", "The money comes from", "Some other random sample"] + tokens_list = [tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) for inputs in inputs_list] + + def run_ov_model(tokens): + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + ov_outputs = ov_model(**tokens, position_ids=position_ids) + + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in ov_outputs) + self.assertIsInstance(ov_outputs.past_key_values, tuple) + if self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode": + self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + # Compare tensor outputs + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + + NUM_THREADS=4 + threads = [] + for i in range(NUM_THREADS): + threads.append(OVThread(target=run_ov_model, args=[tokens_list[i]])) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = 
MODEL_NAMES[model_arch] diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 6cfeb29bb4..f8bfc1cc97 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -14,6 +14,7 @@ import numpy as np import torch +import threading MODEL_NAMES = { @@ -132,3 +133,24 @@ def get_num_quantized_nodes(ov_model): if "4" in elem.get_output_element_type(i).get_type_name(): num_int4 += 1 return num_fake_quantize, num_int8, num_int4 + + +### Multithreading + +class OVThread(threading.Thread): + def __init__(self, target, args): + super().__init__() + self.target = target + self.args = args + + def run(self): + self.exception = None + try: + self.target(*self.args) + except Exception as e: + self.exception = e + + def join(self): + super().join() + if self.exception: + raise self.exception \ No newline at end of file From 5f85cbb7ccd8fc128b774a91073680221b6544f8 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 26 Jan 2024 15:44:36 +0100 Subject: [PATCH 07/15] fix next_beam_idx initialization --- optimum/intel/openvino/modeling_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 1bd57042c8..88498d8c22 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -134,6 +134,7 @@ def __init__( self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] + self.next_beam_idx = None is_stateful_supported = ensure_stateful_is_available(warn=False) if self.use_cache and not self.stateful: logger.warn( From 912cf3abf365528bef68f050e140d39edfbc79b9 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 26 Jan 2024 16:08:43 +0100 Subject: [PATCH 08/15] init version --- optimum/intel/openvino/modeling_diffusion.py | 55 +++++++++++++++----- tests/openvino/gen_img.py | 15 ++++-- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index f5bb5a72af..81d0f2bdc8 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -512,6 +512,24 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + def clone(self): + self.compile() + pipe_cloned = self.__class__( + self.unet.model, + self._internal_dict, + scheduler=self.scheduler, + vae_decoder=self.vae_decoder.model, + vae_encoder=self.vae_encoder.model, + text_encoder=self.text_encoder.model, +# text_encoder_2=self.text_encoder_2.clone(), + tokenizer=self.tokenizer, + tokenizer_2=self.tokenizer_2, + feature_extractor=self.feature_extractor) + pipe_cloned.unet = self.unet.clone() + pipe_cloned.text_encoder = self.text_encoder.clone() + pipe_cloned.vae_decoder = self.vae_decoder.clone() + pipe_cloned.vae_encoder = self.vae_encoder.clone() + return pipe_cloned class OVModelPart: CONFIG_NAME = "config.json" @@ -533,6 +551,7 @@ def __init__( } self.ov_config = ov_config or {**self.parent_model.ov_config} self.compiled_model = None + self.request = None self._model_name = model_name self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME @@ 
-554,6 +573,16 @@ def _compile(self): logger.info(f"{self.device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.compiled_model) + def create_infer_request(self): + if self.request is None: + self.request = self.compiled_model.create_infer_request() + + def clone(self): + model_cloned = self.__class__(self.model, self.parent_model, ov_config=self.ov_config) + model_cloned.model = self.model + model_cloned.compiled_model = self.compiled_model + return model_cloned + @property def device(self): return self.parent_model._device @@ -571,16 +600,16 @@ def __init__( def __call__(self, input_ids: np.ndarray): self._compile() + self.create_infer_request() inputs = { "input_ids": input_ids, } - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs, share_inputs=True) - infer_request.wait() - outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + self.request.start_async(inputs, share_inputs=True) + self.request.wait() + outputs = [self.request.get_tensor(output).data for output in self.request.results] return outputs - + class OVModelUnet(OVModelPart): def __init__( @@ -598,6 +627,7 @@ def __call__( timestep_cond: Optional[np.ndarray] = None, ): self._compile() + self.create_infer_request() inputs = { "sample": sample, @@ -612,10 +642,9 @@ def __call__( if timestep_cond is not None: inputs["timestep_cond"] = timestep_cond - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs, share_inputs=True) - infer_request.wait() - outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + self.request.start_async(inputs, share_inputs=True) + self.request.request.wait() + outputs = [self.request.get_tensor(output).data for output in self.results] return outputs @@ -627,14 +656,14 @@ def __init__( def __call__(self, latent_sample: np.ndarray): self._compile() + self.create_infer_request() inputs = { "latent_sample": latent_sample, } - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs, share_inputs=True) - infer_request.wait() - outputs = [infer_request.results[output].data for output in infer_request.results] + self.request.start_async(inputs, share_inputs=True) + self.request.wait() + outputs = [self.request.results[output].data for output in self.request.results] return outputs def _compile(self): diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py index 6e06132ba5..9f17b7a1e6 100644 --- a/tests/openvino/gen_img.py +++ b/tests/openvino/gen_img.py @@ -6,19 +6,25 @@ from optimum.intel.openvino import OVStableDiffusionPipeline -MODEL_PATH = "/model" +MODEL_PATH = "/home/devuser/model_server/demos/python_demos/stable_diffusion/model" OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="CPU", ov_config=OV_CONFIG) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) +vae_decoder_clon = pipe.vae_decoder.clone() +print(vae_decoder_clon, vae_decoder_clon.model) + +unet_clon = pipe.unet.clone() +print(unet_clon, unet_clon.model) + prompt1 = [" Zebras in space "] prompt2 = [" The statue of liberty in New York", " Big Ben in London "] prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] prompts = [prompt1, prompt2, prompt3] -NUM_THREADS = 3 +NUM_THREADS = 1 threads = [None] * NUM_THREADS results = [None] * NUM_THREADS @@ -33,8 +39,11 @@ def save_response(t, p, r): def 
gen_thread(prompt, results, i): + print("cloning pipe") + pipe_exec = pipe.clone() + print("pipe cloned") text = prompt - images = pipe(text).images + images = pipe_exec(text).images results[i] = images From 03b548d3d8e5b372521d4ab280f172d017d38e2c Mon Sep 17 00:00:00 2001 From: mzegla Date: Mon, 29 Jan 2024 17:22:19 +0100 Subject: [PATCH 09/15] more tests --- tests/openvino/test_modeling.py | 177 +++++++++++++++++++----- tests/openvino/test_stable_diffusion.py | 53 ++++++- tests/openvino/utils_tests.py | 13 +- 3 files changed, 206 insertions(+), 37 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 5fcc368252..5e4767c8c4 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -26,7 +26,7 @@ from datasets import load_dataset from evaluate import evaluator from parameterized import parameterized -import threading +import pytest from PIL import Image from transformers import ( AutoFeatureExtractor, @@ -51,7 +51,7 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor -from utils_tests import MODEL_NAMES, OVThread +from utils_tests import MODEL_NAMES, run_on_multiple_threads from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( @@ -270,6 +270,7 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.skip(reason="Clone operation not implemented for this class of models") def test_compare_to_transformers_multithreading(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) @@ -277,23 +278,19 @@ def test_compare_to_transformers_multithreading(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs_list = ["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", - "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen", - "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", - "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen",] - tokens_list = [] - for inputs in inputs_list: - tokens_list.append(tokenizer(inputs, return_tensors="pt")) - - transformers_outputs_list = [] - for tokens in tokens_list: - with torch.no_grad(): - transformers_outputs_list.append(transformers_model(**tokens)) + inputs_list = [["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."], + ["This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen"], + ["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."], + ["This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. 
Might be the worst thing I've seen",]] - def run_ov_model(inputs, transformers_outputs): + def run_ov_model(inputs): + tokens = tokenizer(inputs, return_tensors="pt") + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + ov_model_instance = ov_model.clone() for input_type in ["pt", "np"]: tokens = tokenizer(inputs, return_tensors=input_type) - ov_outputs = ov_model(**tokens) + ov_outputs = ov_model_instance(**tokens) self.assertIn("logits", ov_outputs) self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs @@ -301,14 +298,7 @@ def run_ov_model(inputs, transformers_outputs): #print(f"[{threading.get_ident()}] OV model logits: {ov_outputs.logits} Torch model logits: {transformers_outputs.logits} Close enough: {close_enough}") self.assertTrue(close_enough) - NUM_THREADS=4 - threads = [] - for i in range(NUM_THREADS): - threads.append(OVThread(target=run_ov_model, args=[inputs_list[i], transformers_outputs_list[i]])) - for thread in threads: - thread.start() - for thread in threads: - thread.join() + run_on_multiple_threads(run_ov_model, args_list=inputs_list) del transformers_model del ov_model @@ -580,15 +570,16 @@ def test_compare_to_transformers_multithreading(self, model_arch): self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode") transformers_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs_list = ["This is a sample", "The sky is blue", "The money comes from", "Some other random sample"] - tokens_list = [tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) for inputs in inputs_list] + inputs_list = ["This is a sample", "Here is another sample", "That's the thrid one", "This is the last sample"] + tokens_list = [[tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None)] for inputs in inputs_list] def run_ov_model(tokens): + ov_model_instance = ov_model.clone() position_ids = None if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: input_shape = tokens["input_ids"].shape position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ov_outputs = ov_model(**tokens, position_ids=position_ids) + ov_outputs = ov_model_instance(**tokens, position_ids=position_ids) self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) @@ -601,14 +592,7 @@ def run_ov_model(tokens): # Compare tensor outputs self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) - NUM_THREADS=4 - threads = [] - for i in range(NUM_THREADS): - threads.append(OVThread(target=run_ov_model, args=[tokens_list[i]])) - for thread in threads: - thread.start() - for thread in threads: - thread.join() + run_on_multiple_threads(run_ov_model, args_list=tokens_list) del transformers_model del ov_model @@ -631,6 +615,28 @@ def test_pipeline(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.xfail(reason="use_cache not handled properly during clone()") + def test_pipeline_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False, compile=False) + model.config.encoder_no_repeat_ngram_size = 0 + model.to("cpu") + model.half() + model.compile() + def run_ov_model(input_text): + # Tokenizer is 
not supposed to be shared by multiple threads + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model.clone(), tokenizer=tokenizer) + outputs = pipe(input_text, max_length=10) + self.assertEqual(pipe.device, model.device) + self.assertTrue(all(input_text in item["generated_text"] for item in outputs)) + del pipe + inputs_list = [["This is a sample"], ["This is a second sample"], ["This is a third sample"]] + run_on_multiple_threads(run_ov_model, args_list=inputs_list) + del model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_multiple_inputs(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -647,6 +653,28 @@ def test_multiple_inputs(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_multiple_inputs_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] + tokens = tokenizer(texts, padding=True, return_tensors="pt") + generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2) + + def run_ov_model(tokens): + model_instance = model.clone() + outputs = model_instance.generate(**tokens, generation_config=generation_config) + self.assertIsInstance(outputs, torch.Tensor) + self.assertEqual(outputs.shape[0], 3) + + tokens_list = [[tokens], [tokens], [tokens], [tokens]] # running in 4 threads + run_on_multiple_threads(run_ov_model, args_list=tokens_list) + del model + gc.collect() + def test_model_and_decoder_same_device(self): model_id = MODEL_NAMES["gpt2"] model = OVModelForCausalLM.from_pretrained(model_id, export=True) @@ -926,6 +954,48 @@ def test_compare_to_transformers(self, model_arch): gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + # This works the old way - infer request per inference, no cloning + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + + self.assertIsInstance(ov_model.encoder, OVEncoder) + self.assertIsInstance(ov_model.decoder, OVDecoder) + self.assertIsInstance(ov_model.decoder_with_past, OVDecoder) + self.assertIsInstance(ov_model.config, PretrainedConfig) + + transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inputs_list = ["This is a sample input for the first thread", + "Input sample for another thread", + "This is a third sample input", + "This last sample is for the last thread"] + args_list = [] + for inputs in inputs_list: + tokens = tokenizer(inputs, return_tensors="pt") + decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + args_list.append([tokens, decoder_inputs]) + + def run_ov_model(tokens, decoder_inputs): + ov_outputs = ov_model(**tokens, **decoder_inputs) + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens, **decoder_inputs) + # Compare tensor 
outputs + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + + run_on_multiple_threads(run_ov_model, args_list=args_list) + del transformers_model + del ov_model + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -959,6 +1029,43 @@ def test_pipeline(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + # This works the old way - infer request per inference, no cloning + def test_pipeline_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, compile=False) + model.half() + model.to("cpu") + model.compile() + + def run_ov_model(text): + # Tokenizer is not supposed to be shared between multiple threads + tokenizer = AutoTokenizer.from_pretrained(model_id) + # Text2Text generation + pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["generated_text"], str) + + # Summarization + pipe = pipeline("summarization", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["summary_text"], str) + + # Translation + pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["translation_text"], str) + del pipe + + texts_list = [["This is a test"], ["This is a test, but for another thread"], ["Yet another test"]] + run_on_multiple_threads(target=run_ov_model, args_list=texts_list) + del model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index 36b1794015..84e94bcae7 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -30,7 +30,8 @@ from openvino.runtime.ie_api import CompiledModel from packaging.version import Version, parse from parameterized import parameterized -from utils_tests import MODEL_NAMES, SEED +import pytest +from utils_tests import MODEL_NAMES, SEED, run_on_multiple_threads from optimum.intel import ( OVLatentConsistencyModelPipeline, @@ -251,6 +252,56 @@ def test_compare_to_diffusers(self, model_arch: str): # Compare model devices self.assertEqual(pipeline.device.type, ov_pipeline.device) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.xfail(reason="OVStableDiffusionPipeline does not implement clone()") + def test_compare_to_diffusers_multithreading(self, model_arch: str): + model_id = MODEL_NAMES[model_arch] + ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) + self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) + self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) + self.assertIsInstance(ov_pipeline.unet, OVModelUnet) + self.assertIsInstance(ov_pipeline.config, Dict) + + pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + pipeline.safety_checker = None + batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64 + + def run_ov_model(prompt): 
+ ov_pipeline_instance = ov_pipeline.clone() + latents = ov_pipeline_instance.prepare_latents( + batch_size * num_images_per_prompt, + ov_pipeline_instance.unet.config["in_channels"], + height, + width, + dtype=np.float32, + generator=np.random.RandomState(0), + ) + + kwargs = { + "prompt": prompt, + "num_inference_steps": 1, + "num_images_per_prompt": num_images_per_prompt, + "height": height, + "width": width, + "guidance_rescale": 0.1, + } + + for output_type in ["latent", "np"]: + ov_outputs = ov_pipeline_instance(latents=latents, output_type=output_type, **kwargs).images + self.assertIsInstance(ov_outputs, np.ndarray) + with torch.no_grad(): + outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + # Compare model outputs + self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) + + # Compare model devices + self.assertEqual(pipeline.device.type, ov_pipeline.device) + + prompt_list = [["sailing ship in storm by Leonardo da Vinci"], ["central park during christmas"], ["zebras in space"]] + run_on_multiple_threads(target=run_ov_model, args_list=prompt_list) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_image_reproducibility(self, model_arch: str): model_id = MODEL_NAMES[model_arch] diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f8bfc1cc97..1a202c1dd1 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -153,4 +153,15 @@ def run(self): def join(self): super().join() if self.exception: - raise self.exception \ No newline at end of file + raise self.exception + +# Each set of args is run in a separate thread. +# Amount of such sets define how many threads are spawned. +def run_on_multiple_threads(target, args_list): + threads = [] + for args in args_list: + threads.append(OVThread(target=target, args=args)) + for thread in threads: + thread.start() + for thread in threads: + thread.join() \ No newline at end of file From e5d9c755afd3e6d605f34142d44395115c1d184a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 14 Feb 2024 15:08:52 +0100 Subject: [PATCH 10/15] running conncurrent execution of stable diffusion pipe with cloning --- optimum/intel/openvino/modeling_diffusion.py | 61 +++++++++++--------- tests/openvino/gen_img.py | 15 +++-- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 81d0f2bdc8..5e12279d9b 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -94,7 +94,7 @@ def __init__( self._model_save_dir = ( Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir ) - self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) + self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) if vae_decoder is not None else None self.unet = OVModelUnet(unet, self) self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None self.text_encoder_2 = ( @@ -104,13 +104,12 @@ def __init__( ) self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - + if vae_decoder is not None: + if "block_out_channels" in self.vae_decoder.config: + 
self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) + else: + self.vae_scale_factor = 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 self.scheduler = scheduler @@ -270,8 +269,8 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - - return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + pipe = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + return pipe @classmethod def _from_transformers( @@ -514,21 +513,29 @@ def _save_config(self, save_directory): def clone(self): self.compile() - pipe_cloned = self.__class__( - self.unet.model, - self._internal_dict, - scheduler=self.scheduler, - vae_decoder=self.vae_decoder.model, - vae_encoder=self.vae_encoder.model, - text_encoder=self.text_encoder.model, -# text_encoder_2=self.text_encoder_2.clone(), - tokenizer=self.tokenizer, - tokenizer_2=self.tokenizer_2, - feature_extractor=self.feature_extractor) + config = self._internal_dict + scheduler = self.scheduler + unet = self.unet.model + model_save_dir = self._model_save_dir + pipe_cloned = self.__class__(unet=unet, + config=config, + scheduler=scheduler, + compile=False, + dynamic_shapes=False, + model_save_dir=model_save_dir) pipe_cloned.unet = self.unet.clone() - pipe_cloned.text_encoder = self.text_encoder.clone() - pipe_cloned.vae_decoder = self.vae_decoder.clone() - pipe_cloned.vae_encoder = self.vae_encoder.clone() + if self.vae_decoder is not None: + pipe_cloned.vae_decoder = self.vae_decoder.clone() + if self.text_encoder is not None: + pipe_cloned.text_encoder= self.text_encoder.clone() + if self.text_encoder_2 is not None: + pipe_cloned.text_encoder_2= self.text_encoder_2.clone() + if self.vae_encoder is not None: + pipe_cloned.vae_encoder= self.vae_encoder.clone() + pipe_cloned.vae_scale_factor = self.vae_scale_factor + pipe_cloned.image_processor = self.image_processor + pipe_cloned.tokenizer = self.tokenizer + pipe_cloned.tokenizer_2 = self.tokenizer_2 return pipe_cloned class OVModelPart: @@ -566,7 +573,7 @@ def _compile(self): ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") - logger.info(f"Compiling the {self._model_name} to {self.device} ...") + logger.info(f"Compiling the {self._model_name} to {self.device} with config {self.ov_config} ... 
") self.compiled_model = core.compile_model(self.model, self.device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: @@ -643,8 +650,8 @@ def __call__( inputs["timestep_cond"] = timestep_cond self.request.start_async(inputs, share_inputs=True) - self.request.request.wait() - outputs = [self.request.get_tensor(output).data for output in self.results] + self.request.wait() + outputs = [self.request.get_tensor(output).data for output in self.request.results] return outputs diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py index 9f17b7a1e6..1fe61b2239 100644 --- a/tests/openvino/gen_img.py +++ b/tests/openvino/gen_img.py @@ -6,25 +6,23 @@ from optimum.intel.openvino import OVStableDiffusionPipeline -MODEL_PATH = "/home/devuser/model_server/demos/python_demos/stable_diffusion/model" +MODEL_PATH = "/model" OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} -pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="CPU", ov_config=OV_CONFIG) +pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="GPU", ov_config=OV_CONFIG, compile=True, dynamic_shapes=True) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) vae_decoder_clon = pipe.vae_decoder.clone() -print(vae_decoder_clon, vae_decoder_clon.model) - unet_clon = pipe.unet.clone() -print(unet_clon, unet_clon.model) prompt1 = [" Zebras in space "] prompt2 = [" The statue of liberty in New York", " Big Ben in London "] prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] + prompts = [prompt1, prompt2, prompt3] -NUM_THREADS = 1 +NUM_THREADS = 3 threads = [None] * NUM_THREADS results = [None] * NUM_THREADS @@ -39,9 +37,10 @@ def save_response(t, p, r): def gen_thread(prompt, results, i): - print("cloning pipe") + start = datetime.datetime.now() pipe_exec = pipe.clone() - print("pipe cloned") + end = datetime.datetime.now() + print("Clonning time [s]", ((end - start).total_seconds())) text = prompt images = pipe_exec(text).images results[i] = images From 34e7e288f11a7d824942d1c63d0e7d0151b3b042 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 16:30:57 +0100 Subject: [PATCH 11/15] add concurrency examples --- examples/openvino/multithreading/README.md | 49 +++++++++++ examples/openvino/multithreading/gen_image.py | 60 ++++++++++++++ .../openvino/multithreading/gen_seq2seq.py | 51 ++++++++++++ examples/openvino/multithreading/gen_text.py | 81 +++++++++++++++++++ .../openvino/multithreading/requirements.txt | 2 + 5 files changed, 243 insertions(+) create mode 100644 examples/openvino/multithreading/README.md create mode 100644 examples/openvino/multithreading/gen_image.py create mode 100644 examples/openvino/multithreading/gen_seq2seq.py create mode 100644 examples/openvino/multithreading/gen_text.py create mode 100644 examples/openvino/multithreading/requirements.txt diff --git a/examples/openvino/multithreading/README.md b/examples/openvino/multithreading/README.md new file mode 100644 index 0000000000..19759a1f30 --- /dev/null +++ b/examples/openvino/multithreading/README.md @@ -0,0 +1,49 @@ +# Execution in multi-threaded environment + +## Overview + +This example demonstrates how to execute the pipelines from Hugging Face transformers with multi concurency. 
+A typical scenario is a multi-threaded application that avoids duplicating the model allocation in the host memory. + +By default, the execution of transformers pipelines with the OpenVINO Runtime backend is single threaded. Running the generation process in parallel can cause the error +`RuntimeError: Infer Request is busy`. + +A simple technique can overcome this limitation: calling the `clone` method on the model or the pipeline. It duplicates the execution object while sharing the OpenVINO compiled model in the host memory. The cloned object should not modify the model by reshaping it, changing its precision, or recompiling it. +The snippet below applies this concept: + +```python +pipe = OVStableDiffusionPipeline.from_pretrained( + MODEL_PATH, ov_config=OV_CONFIG, compile=True +) +def thread(prompt): + pipe_exec = pipe.clone() + images = pipe_exec(prompt).images + # Do something with images + +T1 = threading.Thread(target=thread, args=("my prompt",)) +T1.start() +``` +Note that the `clone` operation is quick and does not duplicate the memory usage. It just creates a new execution context for the generation algorithm. + +Check the simple examples below to see how it can be applied in practice. + +## Preparing the Python environment +```bash +pip install -r examples/openvino/multithreading/requirements.txt +``` + +## Text generation + +```bash +python examples/openvino/multithreading/gen_text.py +``` +## Image generation +```bash +python examples/openvino/multithreading/gen_image.py +``` + +## Text translation with seq2seq + +```bash +python examples/openvino/multithreading/gen_seq2seq.py +``` diff --git a/examples/openvino/multithreading/gen_image.py b/examples/openvino/multithreading/gen_image.py new file mode 100644 index 0000000000..ffa392f26e --- /dev/null +++ b/examples/openvino/multithreading/gen_image.py @@ -0,0 +1,60 @@ +import datetime +import threading + +from optimum.intel.openvino import OVStableDiffusionPipeline + + +MODEL_PATH = "runwayml/stable-diffusion-v1-5" +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} + + +pipe = OVStableDiffusionPipeline.from_pretrained( + MODEL_PATH, ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True +) + +vae_decoder_clon = pipe.vae_decoder.clone() +unet_clon = pipe.unet.clone() + +prompt1 = [" Zebras in space "] +prompt2 = [" The statue of liberty in New York", " Big Ben in London "] +prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] + +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def save_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("IMG:", i) + r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") + +def gen_thread(prompt, results, i): + start = datetime.datetime.now() + pipe_exec = pipe.clone() + end = datetime.datetime.now() + print("Cloning time [s]", ((end - start).total_seconds())) + text = prompt + images = pipe_exec(text).images + results[i] = images + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_img = 0 +for i in range(len(threads)): + threads[i].join() + nu_img += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + save_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "images:", nu_img) +print("Throughput:", nu_img * 60 / ((end - start).total_seconds()), "images/min") diff
--git a/examples/openvino/multithreading/gen_seq2seq.py b/examples/openvino/multithreading/gen_seq2seq.py new file mode 100644 index 0000000000..27d3ed2a45 --- /dev/null +++ b/examples/openvino/multithreading/gen_seq2seq.py @@ -0,0 +1,51 @@ +import datetime +import threading + +from transformers import AutoTokenizer, pipeline + +from optimum.intel import OVModelForSeq2SeqLM + + +model_id = "echarlaix/t5-small-openvino" +model = OVModelForSeq2SeqLM.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + +prompt1 = ["I live in Europe"] +prompt2 = ["What is your name?", "The dog is very happy"] +prompt3 = ["It's a beautiful weather today", "Yes", "Good morning"] +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("TRANSLATION", i, ":", r[i]["translation_text"]) + + +def gen_thread(prompt, results, i): + translations = pipe(prompt) + results[i] = translations + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_trans = 0 +for i in range(len(threads)): + threads[i].join() + nu_trans += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "translations:", nu_trans) +print("Throughput:", nu_trans / ((end - start).total_seconds()), "translations/s") diff --git a/examples/openvino/multithreading/gen_text.py b/examples/openvino/multithreading/gen_text.py new file mode 100644 index 0000000000..2fed55276e --- /dev/null +++ b/examples/openvino/multithreading/gen_text.py @@ -0,0 +1,81 @@ +import threading +from datetime import datetime +from transformers import AutoConfig, AutoTokenizer, set_seed +from optimum.intel import OVModelForCausalLM + + +set_seed(10) +model_id = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" +tokenizer = AutoTokenizer.from_pretrained(model_id) +tokenizer.pad_token = "[PAD]" +tokenizer.padding_side = "left" +NUM_THREADS = 3 +prompt1 = [": Question: What is the weather like now? Answer: "] +prompt2 = [": Question: What is Openvino?", ": Question: What the the relativity theory? Answer: "] +prompt3 = [ + ": Question: Are cats smarter that dogs? Answer: ", + ": Question: How big is an elephant? Answer: ", + ": Question: The water in the ocean is much hotter than before? 
Answer: ", +] +prompts = [prompt1, prompt2, prompt3] + +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "2"} +model = OVModelForCausalLM.from_pretrained( + model_id, + config=AutoConfig.from_pretrained(model_id, trust_remote_code=True), + ov_config=OV_CONFIG, + compile=True, + export=True +) + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for answer in r: + print("Answer:") + print(tokenizer.decode(answer, skip_special_tokens=True)) + + +def gen_thread(prompt, results, i): + inputs = tokenizer(prompt, return_tensors="pt", padding=True) + generate_kwargs = { + "input_ids": inputs.input_ids, + "max_new_tokens": 200, + "temperature": 1.0, + "do_sample": True, + "top_p": 1.0, + "top_k": 50, + "num_beams": 5, + "repetition_penalty": 1.1, + } + start = datetime.now() + model_exec = model.clone() + end = datetime.now() + print("cloning model duration", (end - start).total_seconds() * 1000000, "us") + outputs = model_exec.generate(**generate_kwargs) + num_tok = 0 + for x in range(len(prompt)): + num_tok += outputs[x].numel() - inputs.get("input_ids")[x].numel() + results[i] = outputs, num_tok + + +start = datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() + +total_tok = 0 +for i in range(len(threads)): + threads[i].join() + total_tok += results[i][1] +end = datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i][0]) + +print("Generation time [s]", ((end - start).total_seconds()), "tokens:", total_tok) +print("Throughput:", total_tok * 60 / ((end - start).total_seconds()), "tokens/min") diff --git a/examples/openvino/multithreading/requirements.txt b/examples/openvino/multithreading/requirements.txt new file mode 100644 index 0000000000..b05f68f799 --- /dev/null +++ b/examples/openvino/multithreading/requirements.txt @@ -0,0 +1,2 @@ +optimum-intel[openvino, nncf]"@git+https://github.com/huggingface/optimum-intel.git +transformers \ No newline at end of file From f4d21d8fb4e938fc49dd5d33f7c5a64fbb217c9d Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 16:32:43 +0100 Subject: [PATCH 12/15] preserve request attribure as deprecated --- optimum/intel/openvino/modeling.py | 85 ++++++++++---------- optimum/intel/openvino/modeling_base.py | 14 ++-- optimum/intel/openvino/modeling_decoder.py | 17 ++-- optimum/intel/openvino/modeling_diffusion.py | 26 +++--- optimum/intel/openvino/modeling_seq2seq.py | 7 +- 5 files changed, 80 insertions(+), 69 deletions(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 69dc29a6b6..aafd377991 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -132,6 +132,7 @@ def to(self, device: str): if isinstance(device, str): self._device = device.upper() self.compiled_model = None + self.request = None else: logger.warning(f"device must be of type {str} but got {type(device)} instead") @@ -203,14 +204,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs 
- else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -278,20 +279,20 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) start_logits = ( - torch.from_numpy(self.request.get_tensor("start_logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("start_logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("start_logits").data + else self.infer_request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(self.request.get_tensor("end_logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("end_logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("end_logits").data + else self.infer_request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -358,14 +359,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return TokenClassifierOutput(logits=logits) @@ -432,14 +433,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) last_hidden_state = ( - torch.from_numpy(self.request.get_tensor("last_hidden_state").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("last_hidden_state").data).to(self.device) if not np_inputs - else self.request.get_tensor("last_hidden_state").data + else self.infer_request.get_tensor("last_hidden_state").data ) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -507,14 +508,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return MaskedLMOutput(logits=logits) @@ -643,14 +644,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return 
ImageClassifierOutput(logits=logits) @@ -717,14 +718,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -799,14 +800,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return CausalLMOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 000afee021..287803a62d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -81,7 +81,8 @@ def __init__( output_names[next((name for name in names if "/" not in name), names[0])] = idx self.output_names = output_names self.model = model - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None self.async_exec = False self.compiled_model = None if enable_compilation: @@ -354,14 +355,15 @@ def compile(self): ov_config["CACHE_DIR"] = str(cache_dir) logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.compiled_model = core.compile_model(self.model, self._device, ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.compiled_model) def create_infer_request(self): - if self.request is None: - self.request = self.compiled_model.create_infer_request() + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() def _reshape( self, @@ -400,7 +402,8 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False self.model = self._reshape(self.model, batch_size, sequence_length, height, width) self.compiled_model = None - self.request = None + self.infer_request = None + self.request = None # Deprecated attribute, use compiled_model instead return self def half(self): @@ -409,8 +412,9 @@ def half(self): """ apply_moc_transformations(self.model, cf=False) compress_model_transformation(self.model) - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead self.compiled_model = None + self.infer_request = None return self def forward(self, *args, **kwargs): diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 
c00f916dbc..fde2cdc7bd 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -206,7 +206,8 @@ def update_pkv_precision(self, force_fp32=False): self.model = self._original_model.clone() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.compiled_model = None def _save_pretrained(self, save_directory: Union[str, Path]): """ @@ -342,8 +343,8 @@ def _make_stateful(self): def create_infer_request(self): if self.compiled_model is None: self.compile() - if self.request is None: - self.request = self.compiled_model.create_infer_request() + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() @add_start_docstrings( @@ -423,7 +424,7 @@ def prepare_inputs( # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: # This is the first iteration in a sequence, reset all states - self.request.reset_state() + self.infer_request.reset_state() # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int) @@ -479,9 +480,9 @@ def forward( ) # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + logits = torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if self.stateful: # Need a marker to differentiate the first generate iteration from the others in # the first condition at the function beginning above. @@ -491,7 +492,7 @@ def forward( if not self.stateful: if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple(self.infer_request.get_tensor(key).data for key in self.key_value_output_names) if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 699a745ffa..a6b6d012d3 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -572,7 +572,8 @@ def __init__( } self.ov_config = ov_config or {**self.parent_model.ov_config} self.compiled_model = None - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None self._model_name = model_name self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME @@ -589,14 +590,15 @@ def _compile(self): logger.info(f"Compiling the {self._model_name} to {self.device} with config {self.ov_config} ... 
") self.compiled_model = core.compile_model(self.model, self.device, self.ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self.device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.compiled_model) def create_infer_request(self): - if self.request is None: - self.request = self.compiled_model.create_infer_request() + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() def clone(self): model_cloned = self.__class__(self.model, self.parent_model, ov_config=self.ov_config) @@ -627,9 +629,9 @@ def __call__(self, input_ids: np.ndarray): inputs = { "input_ids": input_ids, } - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - outputs = [self.request.get_tensor(output).data for output in self.request.results] + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.get_tensor(output).data for output in self.infer_request.results] return outputs @@ -664,9 +666,9 @@ def __call__( if timestep_cond is not None: inputs["timestep_cond"] = timestep_cond - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - outputs = [self.request.get_tensor(output).data for output in self.request.results] + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.get_tensor(output).data for output in self.infer_request.results] return outputs @@ -683,9 +685,9 @@ def __call__(self, latent_sample: np.ndarray): inputs = { "latent_sample": latent_sample, } - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - outputs = [self.request.results[output].data for output in self.request.results] + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.results[output].data for output in self.infer_request.results] return outputs def _compile(self): diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 1c58903d23..98b40866a6 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -419,7 +419,8 @@ def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} self.main_input_name = self.parent_model.main_input_name or "input_ids" self.compiled_model = None - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None @add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING) def forward( @@ -461,6 +462,7 @@ def _compile(self): if self.compiled_model is None: logger.info(f"Compiling the encoder to {self._device} ...") self.compiled_model = core.compile_model(self.model, self._device, ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -496,8 +498,9 @@ def 
__init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.use_past = False self.num_pkv = 4 - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead self.compiled_model = None + self.infer_request = None @add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING) def forward( From c971473692d984e2f0f6519d2aa57820f5f77e30 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 16:50:11 +0100 Subject: [PATCH 13/15] drop not needed tests --- tests/openvino/gen_batch.py | 82 ----------------------------------- tests/openvino/gen_img.py | 63 --------------------------- tests/openvino/gen_seq2seq.py | 51 ---------------------- 3 files changed, 196 deletions(-) delete mode 100644 tests/openvino/gen_batch.py delete mode 100644 tests/openvino/gen_img.py delete mode 100644 tests/openvino/gen_seq2seq.py diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py deleted file mode 100644 index 38ecd6d6b2..0000000000 --- a/tests/openvino/gen_batch.py +++ /dev/null @@ -1,82 +0,0 @@ -import threading -from datetime import datetime - -from transformers import AutoConfig, AutoTokenizer, set_seed - -from optimum.intel import OVModelForCausalLM - - -set_seed(10) -model_path = "/model" -tokenizer = AutoTokenizer.from_pretrained(model_path) -tokenizer.pad_token = "[PAD]" -tokenizer.padding_side = "left" -NUM_THREADS = 3 -prompt1 = [" The weather is "] -prompt2 = [" Openvino is a ", " What the the relativity theory "] -prompt3 = [ - " Are cats smarter that dogs ", - " How big is an elephant ", - " the water in the ocean is much hotter than before ", -] -prompts = [prompt1, prompt2, prompt3] - -OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "1"} -model = OVModelForCausalLM.from_pretrained( - model_path, - config=AutoConfig.from_pretrained(model_path, trust_remote_code=True), - ov_config=OV_CONFIG, - compile=True, -) - -threads = [None] * NUM_THREADS -results = [None] * NUM_THREADS - - -def print_response(t, p, r): - print("THREAD", t) - print("PROMPT:", p) - for answer in r: - print("Answer:") - print(tokenizer.decode(answer, skip_special_tokens=True)) - - -def gen_thread(prompt, results, i): - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - generate_kwargs = { - "input_ids": inputs.input_ids, - "max_new_tokens": 200, - "temperature": 1.0, - "do_sample": True, - "top_p": 1.0, - "top_k": 50, - "num_beams": 5, - "repetition_penalty": 1.1, - } - start = datetime.now() - model_exec = model.clone() - end = datetime.now() - print("cloning model duration", (end - start).total_seconds() * 1000000, "us") - outputs = model_exec.generate(**generate_kwargs) - num_tok = 0 - for x in range(len(prompt)): - num_tok += outputs[x].numel() - inputs.get("input_ids")[x].numel() - results[i] = outputs, num_tok - - -start = datetime.now() -for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) - threads[i].start() - -total_tok = 0 -for i in range(len(threads)): - threads[i].join() - total_tok += results[i][1] -end = datetime.now() - -for i in range(len(threads)): - print_response(i, prompts[i], results[i][0]) - -print("Generation time [s]", ((end - start).total_seconds()), "tokens:", total_tok) -print("Throughput:", total_tok * 60 / ((end - start).total_seconds()), "tokens/min") diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py deleted file mode 100644 index 1fe61b2239..0000000000 --- a/tests/openvino/gen_img.py +++ /dev/null 
@@ -1,63 +0,0 @@ -import datetime -import threading - -from diffusers import DDIMScheduler - -from optimum.intel.openvino import OVStableDiffusionPipeline - - -MODEL_PATH = "/model" -OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} - - -pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="GPU", ov_config=OV_CONFIG, compile=True, dynamic_shapes=True) -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - -vae_decoder_clon = pipe.vae_decoder.clone() -unet_clon = pipe.unet.clone() - -prompt1 = [" Zebras in space "] -prompt2 = [" The statue of liberty in New York", " Big Ben in London "] -prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] - -prompts = [prompt1, prompt2, prompt3] - -NUM_THREADS = 3 - -threads = [None] * NUM_THREADS -results = [None] * NUM_THREADS - - -def save_response(t, p, r): - print("THREAD", t) - print("PROMPT:", p) - for i in range(len(r)): - print("IMG:", i) - r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") - - -def gen_thread(prompt, results, i): - start = datetime.datetime.now() - pipe_exec = pipe.clone() - end = datetime.datetime.now() - print("Clonning time [s]", ((end - start).total_seconds())) - text = prompt - images = pipe_exec(text).images - results[i] = images - - -start = datetime.datetime.now() -for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) - threads[i].start() -nu_img = 0 -for i in range(len(threads)): - threads[i].join() - nu_img += len(results[i]) -end = datetime.datetime.now() - -for i in range(len(threads)): - save_response(i, prompts[i], results[i]) - -print("Generation time [s]", ((end - start).total_seconds()), "images:", nu_img) -print("Throughput:", nu_img * 60 / ((end - start).total_seconds()), "images/min") diff --git a/tests/openvino/gen_seq2seq.py b/tests/openvino/gen_seq2seq.py deleted file mode 100644 index 27d3ed2a45..0000000000 --- a/tests/openvino/gen_seq2seq.py +++ /dev/null @@ -1,51 +0,0 @@ -import datetime -import threading - -from transformers import AutoTokenizer, pipeline - -from optimum.intel import OVModelForSeq2SeqLM - - -model_id = "echarlaix/t5-small-openvino" -model = OVModelForSeq2SeqLM.from_pretrained(model_id) -tokenizer = AutoTokenizer.from_pretrained(model_id) -pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) - -prompt1 = ["I live in Europe"] -prompt2 = ["What is your name?", "The dog is very happy"] -prompt3 = ["It's a beautiful weather today", "Yes", "Good morning"] -prompts = [prompt1, prompt2, prompt3] - -NUM_THREADS = 3 - -threads = [None] * NUM_THREADS -results = [None] * NUM_THREADS - - -def print_response(t, p, r): - print("THREAD", t) - print("PROMPT:", p) - for i in range(len(r)): - print("TRANSLATION", i, ":", r[i]["translation_text"]) - - -def gen_thread(prompt, results, i): - translations = pipe(prompt) - results[i] = translations - - -start = datetime.datetime.now() -for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) - threads[i].start() -nu_trans = 0 -for i in range(len(threads)): - threads[i].join() - nu_trans += len(results[i]) -end = datetime.datetime.now() - -for i in range(len(threads)): - print_response(i, prompts[i], results[i]) - -print("Generation time [s]", ((end - start).total_seconds()), "translations:", nu_trans) -print("Throughput:", nu_trans / ((end - start).total_seconds()), "translations/s") From cbdb3044a07cbf758057437fbdec75e2f33eeb57 Mon Sep 
17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 17:06:51 +0100 Subject: [PATCH 14/15] style fix --- examples/openvino/multithreading/gen_image.py | 4 +++- examples/openvino/multithreading/gen_text.py | 4 +++- optimum/intel/openvino/modeling_base.py | 6 +++--- tests/openvino/test_stable_diffusion.py | 4 +--- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/openvino/multithreading/gen_image.py b/examples/openvino/multithreading/gen_image.py index ffa392f26e..21f8737445 100644 --- a/examples/openvino/multithreading/gen_image.py +++ b/examples/openvino/multithreading/gen_image.py @@ -9,7 +9,7 @@ pipe = OVStableDiffusionPipeline.from_pretrained( - MODEL_PATH, ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True + MODEL_PATH, ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True ) vae_decoder_clon = pipe.vae_decoder.clone() @@ -34,6 +34,7 @@ def save_response(t, p, r): print("IMG:", i) r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") + def gen_thread(prompt, results, i): start = datetime.datetime.now() pipe_exec = pipe.clone() @@ -43,6 +44,7 @@ def gen_thread(prompt, results, i): images = pipe_exec(text).images results[i] = images + start = datetime.datetime.now() for i in range(len(threads)): threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) diff --git a/examples/openvino/multithreading/gen_text.py b/examples/openvino/multithreading/gen_text.py index 2fed55276e..717f20cfc6 100644 --- a/examples/openvino/multithreading/gen_text.py +++ b/examples/openvino/multithreading/gen_text.py @@ -1,6 +1,8 @@ import threading from datetime import datetime + from transformers import AutoConfig, AutoTokenizer, set_seed + from optimum.intel import OVModelForCausalLM @@ -25,7 +27,7 @@ config=AutoConfig.from_pretrained(model_id, trust_remote_code=True), ov_config=OV_CONFIG, compile=True, - export=True + export=True, ) threads = [None] * NUM_THREADS diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 66737f68f3..0892736e07 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -81,7 +81,7 @@ def __init__( output_names[next((name for name in names if "/" not in name), names[0])] = idx self.output_names = output_names self.model = model - self.request = None # Deprecated attribute, use compiled_model instead + self.request = None # Deprecated attribute, use compiled_model instead self.infer_request = None self.async_exec = False self.compiled_model = None @@ -349,7 +349,7 @@ def compile(self): ov_config["CACHE_DIR"] = str(cache_dir) logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.compiled_model = core.compile_model(self.model, self._device, ov_config) - self.request = self.compiled_model # Deprecated attribute, use compiled_model instead + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -396,7 +396,7 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False self.model = self._reshape(self.model, batch_size, sequence_length, height, width) self.compiled_model = None - self.infer_request 
= None + self.infer_request = None self.request = None # Deprecated attribute, use compiled_model instead return self diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index c808397312..65044339f2 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -271,9 +271,7 @@ def test_compare_to_diffusers_multithreading(self, model_arch: str): pipeline.safety_checker = None batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64 - def run_ov_model( - prompt, ov_pipeline - ): + def run_ov_model(prompt, ov_pipeline): ov_pipeline_instance = ov_pipeline.clone() latents = ov_pipeline_instance.prepare_latents( batch_size * num_images_per_prompt, From 30437ae38d2d527b46b980c3f09be884646704f9 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 20 Feb 2024 12:29:50 +0100 Subject: [PATCH 15/15] fix tests without gpu --- tests/openvino/test_modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 40027b2f18..84e33d7f7b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -576,6 +576,8 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers_multithreading(self, model_arch): model_id = MODEL_NAMES[model_arch] + if "llama_gptq" in model_arch: + self.skipTest("Not supported without gpu and disable_exllama=True option") set_seed(SEED) ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_model.config, PretrainedConfig)
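To recap the usage pattern this patch series enables, the sketch below condenses what the multithreading examples and tests above exercise: a single compiled OVModelForCausalLM shared across threads, with each worker calling the clone() helper these patches exercise to obtain its own inference state before generate(). The model id, prompts, and generation settings are placeholders, so treat this as a minimal sketch under those assumptions rather than a copy of examples/openvino/multithreading/gen_text.py.

    import threading

    from transformers import AutoTokenizer, set_seed

    from optimum.intel import OVModelForCausalLM

    set_seed(10)
    model_id = "gpt2"  # placeholder; any causal LM exportable to OpenVINO
    OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "1"}

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    # One shared, compiled model; per-thread state comes from clone().
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=OV_CONFIG, compile=True)

    prompts = [["The weather is"], ["OpenVINO is a", "What is the theory of relativity"]]
    results = [None] * len(prompts)


    def gen_thread(prompt, index):
        # clone() shares the compiled model but gives this thread its own infer request,
        # so concurrent generate() calls do not share inference state.
        local_model = model.clone()
        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
        outputs = local_model.generate(**inputs, max_new_tokens=32, do_sample=False)
        results[index] = tokenizer.batch_decode(outputs, skip_special_tokens=True)


    threads = [threading.Thread(target=gen_thread, args=(p, i)) for i, p in enumerate(prompts)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    for i, decoded in enumerate(results):
        print("THREAD", i, decoded)

The same pattern applies to the diffusion pipelines: compile OVStableDiffusionPipeline once, then clone() it in each worker thread, as in the gen_image.py example and test_compare_to_diffusers_multithreading above. A compact sketch, again with the placeholder model path used by the examples:

    import threading

    from optimum.intel.openvino import OVStableDiffusionPipeline

    OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"}
    pipe = OVStableDiffusionPipeline.from_pretrained(
        "/model", ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True
    )

    prompts = ["Zebras in space", "The Statue of Liberty in New York"]
    images = [None] * len(prompts)


    def draw_thread(prompt, index):
        # Each thread clones the pipeline so its text encoder/UNet/VAE infer requests stay private.
        local_pipe = pipe.clone()
        images[index] = local_pipe(prompt).images[0]


    threads = [threading.Thread(target=draw_thread, args=(p, i)) for i, p in enumerate(prompts)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    for i, image in enumerate(images):
        image.save(f"img_{i}.png")

In both sketches the shared object owns the compiled model, while each clone carries only lightweight per-thread inference state; this is the design the deprecated self.request attribute is being replaced by (compiled_model plus a per-instance infer_request).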