From 9b551006ed5a967e01fc3e45b3d500796ffcf855 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 17 Jan 2024 19:09:54 +0100 Subject: [PATCH 01/15] support for concurrency in llm models --- optimum/intel/openvino/modeling.py | 68 ++++++++++++-------- optimum/intel/openvino/modeling_base.py | 6 +- optimum/intel/openvino/modeling_decoder.py | 42 ++++++++----- tests/openvino/gen_batch.py | 72 ++++++++++++++++++++++ tests/openvino/test_modeling.py | 18 +++--- 5 files changed, 157 insertions(+), 49 deletions(-) create mode 100644 tests/openvino/gen_batch.py diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index f6d3061a7a..881dcd2b9e 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -130,6 +130,7 @@ def to(self, device: str): be in upper or lower case. To speed up first inference, call `.compile()` after `.to()`. """ self._device = device.upper() + self.compiled_model = None self.request = None return self @@ -197,8 +198,10 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return SequenceClassifierOutput(logits=logits) @@ -263,12 +266,14 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() start_logits = ( - torch.from_numpy(outputs["start_logits"]).to(self.device) if not np_inputs else outputs["start_logits"] + torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(outputs["end_logits"]).to(self.device) if not np_inputs else outputs["end_logits"] + torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -333,8 +338,10 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return TokenClassifierOutput(logits=logits) @@ -398,11 +405,13 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() last_hidden_state = ( - torch.from_numpy(outputs["last_hidden_state"]).to(self.device) + torch.from_numpy(infer_request.get_tensor("last_hidden_state").data).to(self.device) if not np_inputs - else outputs["last_hidden_state"] + else infer_request.get_tensor("last_hidden_state").data ) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ 
-468,8 +477,10 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return MaskedLMOutput(logits=logits) @@ -595,8 +606,10 @@ def forward( } # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return ImageClassifierOutput(logits=logits) @@ -660,8 +673,10 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return SequenceClassifierOutput(logits=logits) @@ -732,8 +747,10 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return CausalLMOutput(logits=logits) @@ -813,12 +830,13 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data embeddings = ( - torch.from_numpy(outputs["embeddings"]).to(self.device) if not np_inputs else outputs["embeddings"] + torch.from_numpy(infer_request.get_tensor("embeddings").data).to(self.device) if not np_inputs else infer_request.get_tensor("embeddings").data ) - return XVectorOutput(logits=logits, embeddings=embeddings) @@ -890,7 +908,9 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data return TokenClassifierOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_base.py 
b/optimum/intel/openvino/modeling_base.py index 05dc3af9b5..deff686e38 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -89,6 +89,7 @@ def __init__( self.model = model self.request = None + self.compiled_model = None if enable_compilation: self.compile() @@ -343,7 +344,7 @@ def _to_load( ) def compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the model to {self._device} ...") ov_config = {**self.ov_config} if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): @@ -351,7 +352,7 @@ def compile(self): cache_dir = Path(self.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") - self.request = core.compile_model(self.model, self._device, ov_config) + self.compiled_model = core.compile_model(self.model, self._device, ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -403,6 +404,7 @@ def half(self): apply_moc_transformations(self.model, cf=False) compress_model_transformation(self.model) self.request = None + self.compiled_model = None return self def forward(self, *args, **kwargs): diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 8a2167eae4..b2fb2ed593 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -135,21 +135,13 @@ def __init__( self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] - self._original_model = self.model.clone() # keep original model for serialization - self._pkv_precision = Type.f32 - self.next_beam_idx = None - self.update_pkv_precision() - if self.is_dynamic: - self.model = self._reshape(self.model, -1, -1) is_stateful_supported = ensure_stateful_is_available(warn=False) - if self.use_cache and not self.stateful: logger.warn( "Provided model does not contain state. It may lead to sub-optimal performance." 
"Please reexport model with updated OpenVINO version >= 2023.3.0 calling the `from_pretrained` method with original model " "and `export=True` parameter" ) - if self.stateful: if stateful is None: stateful = is_stateful_supported @@ -176,7 +168,13 @@ def raise_error(model_prop, user_prop, name): if use_cache ^ self.use_cache: raise_error(self.use_cache, use_cache, "use_cache") - if enable_compilation: + def init_ov_model(self, compile=True): + self._pkv_precision = Type.f32 + self.update_pkv_precision(force_fp32=False) + if self.is_dynamic: + self.model = self._reshape(self.model, -1, -1) + self._original_model = self.model.clone() # keep original model for serialization + if compile: self.compile() def update_pkv_precision(self, force_fp32=False): @@ -282,9 +280,10 @@ def _from_transformers( config.is_decoder = True config.is_encoder_decoder = False config.save_pretrained(save_dir_path) - return cls._from_pretrained( + model_instance = cls._from_pretrained( model_id=save_dir_path, config=config, use_cache=use_cache, load_in_8bit=False, stateful=None, **kwargs ) + return model_instance def _reshape( self, @@ -322,14 +321,18 @@ def reshape(self, batch_size: int, sequence_length: int): return self def compile(self): - if self.request is None: + if self.compiled_model is None: super().compile() - self.request = self.request.create_infer_request() def _make_stateful(self): patch_stateful(self.config, self.model) self.stateful = True + def create_infer_request(self): + if self.compiled_model is None: + self.compile() + if self.request is None: + self.request = self.compiled_model.create_infer_request() @add_start_docstrings( """ @@ -359,6 +362,8 @@ def forward( **kwargs, ) -> CausalLMOutputWithPast: self.compile() + self.create_infer_request() + if self.use_cache and past_key_values is not None: input_ids = input_ids[:, -1:] @@ -556,8 +561,17 @@ def _from_pretrained( else: init_cls = cls - return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) - + model_instance = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) + model_instance.init_ov_model(compile=kwargs.get("compile", True)) + model_instance.request = None + return model_instance + + def clone(self): + model_instance = self.__class__(model=self.model, config=self.config, compile=False) + model_instance.compiled_model = self.compiled_model + model_instance._pkv_precision = self._pkv_precision + model_instance.request = None + return model_instance class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py new file mode 100644 index 0000000000..282fd09921 --- /dev/null +++ b/tests/openvino/gen_batch.py @@ -0,0 +1,72 @@ +from optimum.intel import OVModelForCausalLM +from transformers import AutoTokenizer, AutoConfig, set_seed +import threading +from datetime import datetime + +set_seed(10) +model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf-stateful/pytorch/dldt/compressed_weights/OV_FP16-INT8/" +#model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf/pytorch/dldt/compressed_weights/OV_FP16-INT8/" +tokenizer = AutoTokenizer.from_pretrained(model_path) +tokenizer.pad_token = "[PAD]" +tokenizer.padding_side = "left" + +prompt1 = [" The weather is "] +prompt2 = [" Openvino is a " , " What the the relativity theory "] +prompt3 = [" Are cats smarter that dogs ", " 
How big is an elephant ", " the water in the ocean is much hotter than before "] +prompts = [prompt1, prompt2, prompt3] +OV_CONFIG = {'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '','NUM_STREAMS': '1'} +model = OVModelForCausalLM.from_pretrained(model_path, config=AutoConfig.from_pretrained(model_path, trust_remote_code=True),ov_config=OV_CONFIG, compile=True) + +NUM_THREADS = 3 + +threads = [None]* NUM_THREADS +results = [None]* NUM_THREADS + +def print_response(t,p,r): + print("THREAD", t) + print("PROMPT:", p) + for answer in r: + print("Answer:") + print(tokenizer.decode(answer, skip_special_tokens=True)) + +def gen_thread(prompt, results, i): + inputs = tokenizer(prompt, return_tensors="pt", padding=True) + generate_kwargs = dict( + input_ids=inputs.input_ids, + max_new_tokens=500, + temperature=1.0, + do_sample=True, + top_p=1.0, + top_k=50, + num_beams=5, + repetition_penalty=1.1 + ) + start = datetime.now() + model_exec = model.clone() + end = datetime.now() + print("cloning model duration", (end - start).total_seconds()*1000000, "us") + outputs = model_exec.generate(**generate_kwargs) + num_tok = 0 + for i in range(len(prompt)): + num_tok += outputs[i].numel() - inputs.get("input_ids")[i].numel() + results[i] = outputs, num_tok + +start = datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i],results, i)) + threads[i].start() + +total_tok = 0 +for i in range(len(threads)): + threads[i].join() + total_tok += results[i][1] +end = datetime.now() + +for i in range(len(threads)): + print_response(i,prompts[i],results[i][0]) + +print("Generation time [s]", ((end-start).total_seconds()), "tokens:", total_tok) +print("Throughput:", total_tok * 60 / ((end-start).total_seconds()), "tokens/min" ) + + + diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index e51b50a5b2..2f9143101c 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -129,9 +129,9 @@ def test_load_from_hub_and_save_model(self): self.assertTrue(manual_openvino_cache_dir.is_dir()) self.assertGreaterEqual(len(list(manual_openvino_cache_dir.glob("*.blob"))), 1) if is_openvino_version("<", "2023.3"): - self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") + self.assertEqual(loaded_model.compiled_model.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") else: - self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT"), "THROUGHPUT") + self.assertEqual(loaded_model.compiled_model.get_property("PERFORMANCE_HINT"), "THROUGHPUT") with tempfile.TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) @@ -748,7 +748,7 @@ def test_pipeline(self, model_arch): @parameterized.expand(TIMM_MODELS) def test_compare_to_timm(self, model_id): ov_model = OVModelForImageClassification.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) - self.assertEqual(ov_model.request.get_property("INFERENCE_PRECISION_HINT").to_string(), "f32") + self.assertEqual(ov_model.compiled_model.get_property("INFERENCE_PRECISION_HINT").to_string(), "f32") self.assertIsInstance(ov_model.config, PretrainedConfig) timm_model = timm.create_model(model_id, pretrained=True) preprocessor = TimmImageProcessor.from_pretrained(model_id) @@ -886,20 +886,21 @@ def test_compare_with_and_without_past_key_values(self): text = "This is a sample input" tokens = tokenizer(text, return_tensors="pt") - model_with_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=True) + 
model_with_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) _ = model_with_pkv.generate(**tokens) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=False) + model_without_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) _ = model_without_pkv.generate(**tokens) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - + print(outputs_model_with_pkv) + print(outputs_model_without_pkv) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) @@ -1201,20 +1202,19 @@ def test_compare_with_and_without_past_key_values(self): question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") - model_with_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=True) + model_with_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) _ = model_with_pkv.generate(**inputs) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=False) + model_without_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) _ = model_without_pkv.generate(**inputs) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) From 9e4ab17d3b383c9f96fee5c85ee695096a85424f Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 18 Jan 2024 08:38:38 +0100 Subject: [PATCH 02/15] style fixes --- optimum/intel/openvino/modeling.py | 60 +++++++++++++++---- optimum/intel/openvino/modeling_decoder.py | 3 +- optimum/intel/openvino/quantization.py | 2 + tests/openvino/gen_batch.py | 70 +++++++++++++--------- tests/openvino/test_modeling.py | 18 ++++-- 5 files changed, 107 insertions(+), 46 deletions(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 881dcd2b9e..fd3751de01 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -201,7 +201,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return 
SequenceClassifierOutput(logits=logits) @@ -270,10 +274,14 @@ def forward( infer_request.start_async(inputs) infer_request.wait() start_logits = ( - torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("start_logits").data + torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("end_logits").data + torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -341,7 +349,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return TokenClassifierOutput(logits=logits) @@ -480,7 +492,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return MaskedLMOutput(logits=logits) @@ -609,7 +625,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return ImageClassifierOutput(logits=logits) @@ -676,7 +696,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return SequenceClassifierOutput(logits=logits) @@ -750,7 +774,11 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return CausalLMOutput(logits=logits) @@ -833,9 +861,15 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = 
torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) embeddings = ( - torch.from_numpy(infer_request.get_tensor("embeddings").data).to(self.device) if not np_inputs else infer_request.get_tensor("embeddings").data + torch.from_numpy(infer_request.get_tensor("embeddings").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("embeddings").data ) return XVectorOutput(logits=logits, embeddings=embeddings) @@ -911,6 +945,10 @@ def forward( infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs) infer_request.wait() - logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if not np_inputs else infer_request.get_tensor("logits").data + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return TokenClassifierOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b2fb2ed593..1bd57042c8 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -111,7 +111,6 @@ def __init__( "`dynamic_shapes` was set to `False` but static shapes are not supported for causal language model. Please set `dynamic_shapes=True`." ) - enable_compilation = kwargs.get("compile", True) kwargs["compile"] = False # avoid extra compilation in the base class super().__init__( @@ -334,6 +333,7 @@ def create_infer_request(self): if self.request is None: self.request = self.compiled_model.create_infer_request() + @add_start_docstrings( """ OpenVINO Model with a causal language modeling head on top (linear layer with weights tied to the input @@ -573,6 +573,7 @@ def clone(self): model_instance.request = None return model_instance + class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 3da486c45c..fd517b3c88 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -294,6 +294,8 @@ def _quantize_ovcausallm( # Prefeth past_key_values self.model.update_pkv_precision(True) self.model.compile() + self.model.create_infer_request() + subset_size = kwargs.get("subset_size", 300) data_cache = [] diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py index 282fd09921..5b92ad3b06 100644 --- a/tests/openvino/gen_batch.py +++ b/tests/openvino/gen_batch.py @@ -1,59 +1,74 @@ -from optimum.intel import OVModelForCausalLM -from transformers import AutoTokenizer, AutoConfig, set_seed import threading -from datetime import datetime +from datetime import datetime + +from transformers import AutoConfig, AutoTokenizer, set_seed + +from optimum.intel import OVModelForCausalLM + set_seed(10) model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf-stateful/pytorch/dldt/compressed_weights/OV_FP16-INT8/" -#model_path = "/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf/pytorch/dldt/compressed_weights/OV_FP16-INT8/" +# model_path = 
"/home/devuser/openvino.genai/llm_bench/python/llama-2-7b-chat-hf/pytorch/dldt/compressed_weights/OV_FP16-INT8/" tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" prompt1 = [" The weather is "] -prompt2 = [" Openvino is a " , " What the the relativity theory "] -prompt3 = [" Are cats smarter that dogs ", " How big is an elephant ", " the water in the ocean is much hotter than before "] +prompt2 = [" Openvino is a ", " What the the relativity theory "] +prompt3 = [ + " Are cats smarter that dogs ", + " How big is an elephant ", + " the water in the ocean is much hotter than before ", +] prompts = [prompt1, prompt2, prompt3] -OV_CONFIG = {'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '','NUM_STREAMS': '1'} -model = OVModelForCausalLM.from_pretrained(model_path, config=AutoConfig.from_pretrained(model_path, trust_remote_code=True),ov_config=OV_CONFIG, compile=True) +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "1"} +model = OVModelForCausalLM.from_pretrained( + model_path, + config=AutoConfig.from_pretrained(model_path, trust_remote_code=True), + ov_config=OV_CONFIG, + compile=True, +) NUM_THREADS = 3 -threads = [None]* NUM_THREADS -results = [None]* NUM_THREADS +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + -def print_response(t,p,r): +def print_response(t, p, r): print("THREAD", t) print("PROMPT:", p) for answer in r: print("Answer:") print(tokenizer.decode(answer, skip_special_tokens=True)) + def gen_thread(prompt, results, i): inputs = tokenizer(prompt, return_tensors="pt", padding=True) - generate_kwargs = dict( - input_ids=inputs.input_ids, - max_new_tokens=500, - temperature=1.0, - do_sample=True, - top_p=1.0, - top_k=50, - num_beams=5, - repetition_penalty=1.1 - ) + generate_kwargs = { + "input_ids": inputs.input_ids, + "max_new_tokens": 200, + "temperature": 1.0, + "do_sample": True, + "top_p": 1.0, + "top_k": 50, + "num_beams" :5, + "repetition_penalty": 1.1 + } start = datetime.now() model_exec = model.clone() end = datetime.now() - print("cloning model duration", (end - start).total_seconds()*1000000, "us") + print("cloning model duration", (end - start).total_seconds() * 1000000, "us") outputs = model_exec.generate(**generate_kwargs) num_tok = 0 for i in range(len(prompt)): num_tok += outputs[i].numel() - inputs.get("input_ids")[i].numel() results[i] = outputs, num_tok + start = datetime.now() for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i],results, i)) + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) threads[i].start() total_tok = 0 @@ -63,10 +78,7 @@ def gen_thread(prompt, results, i): end = datetime.now() for i in range(len(threads)): - print_response(i,prompts[i],results[i][0]) - -print("Generation time [s]", ((end-start).total_seconds()), "tokens:", total_tok) -print("Throughput:", total_tok * 60 / ((end-start).total_seconds()), "tokens/min" ) - - + print_response(i, prompts[i], results[i][0]) +print("Generation time [s]", ((end - start).total_seconds()), "tokens:", total_tok) +print("Throughput:", total_tok * 60 / ((end - start).total_seconds()), "tokens/min") diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2f9143101c..c1746f5538 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -886,21 +886,25 @@ def test_compare_with_and_without_past_key_values(self): text = "This is a sample input" tokens = tokenizer(text, 
return_tensors="pt") - model_with_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) + model_with_pkv = OVModelForSeq2SeqLM.from_pretrained( + model_id, export=True, use_cache=True, ov_config=F32_CONFIG + ) _ = model_with_pkv.generate(**tokens) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) + model_without_pkv = OVModelForSeq2SeqLM.from_pretrained( + model_id, export=True, use_cache=False, ov_config=F32_CONFIG + ) _ = model_without_pkv.generate(**tokens) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) print(outputs_model_with_pkv) - print(outputs_model_without_pkv) + print(outputs_model_without_pkv) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) @@ -1202,14 +1206,18 @@ def test_compare_with_and_without_past_key_values(self): question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") - model_with_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=True, ov_config=F32_CONFIG) + model_with_pkv = OVModelForPix2Struct.from_pretrained( + model_id, export=True, use_cache=True, ov_config=F32_CONFIG + ) _ = model_with_pkv.generate(**inputs) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=False, ov_config=F32_CONFIG) + model_without_pkv = OVModelForPix2Struct.from_pretrained( + model_id, export=True, use_cache=False, ov_config=F32_CONFIG + ) _ = model_without_pkv.generate(**inputs) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( From dcb2a8fddc2833fdd9f82989ab5554aac787f012 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 19 Jan 2024 12:01:11 +0100 Subject: [PATCH 03/15] concurrency in seq2seq and stable diffusion classes --- optimum/intel/openvino/modeling_diffusion.py | 36 ++++++++---- optimum/intel/openvino/modeling_seq2seq.py | 31 +++++----- tests/openvino/gen_batch.py | 4 +- tests/openvino/gen_img.py | 59 ++++++++++++++++++++ tests/openvino/gen_seq2seq.py | 51 +++++++++++++++++ tests/openvino/test_stable_diffusion.py | 15 +++-- 6 files changed, 161 insertions(+), 35 deletions(-) create mode 100644 tests/openvino/gen_img.py create mode 100644 tests/openvino/gen_seq2seq.py diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index fa48a5df68..8454acc3d9 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -532,7 +532,7 @@ def __init__( for inputs in self.model.inputs } self.ov_config = ov_config or {**self.parent_model.ov_config} - self.request = None + self.compiled_model = None self._model_name = model_name self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / 
model_name / self.CONFIG_NAME @@ -541,13 +541,13 @@ def __init__( self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") def _compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the {self._model_name} to {self.device} ...") - self.request = core.compile_model(self.model, self.device, self.ov_config) + self.compiled_model = core.compile_model(self.model, self.device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self.device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(self.request) + _print_compiled_model_properties(self.compiled_model) @property def device(self): @@ -570,8 +570,11 @@ def __call__(self, input_ids: np.ndarray): inputs = { "input_ids": input_ids, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + return outputs class OVModelUnet(OVModelPart): @@ -604,8 +607,11 @@ def __call__( if timestep_cond is not None: inputs["timestep_cond"] = timestep_cond - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + return outputs class OVModelVaeDecoder(OVModelPart): @@ -620,8 +626,11 @@ def __call__(self, latent_sample: np.ndarray): inputs = { "latent_sample": latent_sample, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.results[output].data for output in infer_request.results] + return outputs def _compile(self): if "GPU" in self.device: @@ -641,8 +650,11 @@ def __call__(self, sample: np.ndarray): inputs = { "sample": sample, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + return outputs def _compile(self): if "GPU" in self.device: diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 9a7f913ab2..db05e0acc9 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -442,7 +442,7 @@ def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict, self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} self.main_input_name = main_input_name self.ov_config = ov_config - self.request = None + self.compiled_model = None @add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING) def forward( @@ -461,9 +461,10 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - last_hidden_state = torch.from_numpy( - self.request(inputs, share_inputs=True, 
share_outputs=True)["last_hidden_state"] - ).to(self.device) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + last_hidden_state = torch.from_numpy(infer_request.get_tensor("last_hidden_state").data).to(self.device) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -471,9 +472,9 @@ def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) def _compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the encoder to {self._device} ...") - self.request = core.compile_model(self.model, self._device, self.ov_config) + self.compiled_model = core.compile_model(self.model, self._device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -509,7 +510,7 @@ def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict): self.num_pkv = 4 self.ov_config = ov_config - self.request = None + self.compiled_model = None @add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING) def forward( @@ -546,13 +547,14 @@ def forward( if "decoder_attention_mask" in self.input_names and decoder_attention_mask is not None: inputs["decoder_attention_mask"] = decoder_attention_mask # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the # self-attention layer and 2 to the cross-attention layer) - out_past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + out_past_key_values = tuple(infer_request.get_tensor(key).data for key in self.key_value_output_names) # Tuple of tuple of length `n_layers`, with each tuple of length equal to: # * 4 for the decoder without cache (k/v of self-attention + k/v of cross-attention) @@ -574,14 +576,13 @@ def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) def _compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the decoder to {self._device} ...") - compiled_model = core.compile_model(self.model, self._device, self.ov_config) - self.request = compiled_model.create_infer_request() + self.compiled_model = core.compile_model(self.model, self._device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(compiled_model) + _print_compiled_model_properties(self.compiled_model) @add_start_docstrings( diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py index 5b92ad3b06..5d165217ec 100644 --- a/tests/openvino/gen_batch.py +++ b/tests/openvino/gen_batch.py @@ -52,8 +52,8 @@ def gen_thread(prompt, results, i): "do_sample": True, "top_p": 1.0, "top_k": 50, - 
"num_beams" :5, - "repetition_penalty": 1.1 + "num_beams": 5, + "repetition_penalty": 1.1, } start = datetime.now() model_exec = model.clone() diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py new file mode 100644 index 0000000000..72a6c97439 --- /dev/null +++ b/tests/openvino/gen_img.py @@ -0,0 +1,59 @@ +import datetime +import threading + +from diffusers import DDIMScheduler + +from optimum.intel.openvino import OVStableDiffusionPipeline + + +MODEL_PATH = "/home/devuser/model_server/demos/python_demos/stable_diffusion/model" +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} + + +pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="CPU", ov_config=OV_CONFIG) +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + + +# set_seed(10) + + +prompt1 = [" Zebras in space "] +prompt2 = [" The statue of liberty in New York", " Big Ben in London "] +prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def save_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("IMG:", i) + r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") + + +def gen_thread(prompt, results, i): + text = prompt + images = pipe(text).images + results[i] = images + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_img = 0 +for i in range(len(threads)): + threads[i].join() + nu_img += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + save_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "images:", nu_img) +print("Throughput:", nu_img * 60 / ((end - start).total_seconds()), "images/min") diff --git a/tests/openvino/gen_seq2seq.py b/tests/openvino/gen_seq2seq.py new file mode 100644 index 0000000000..27d3ed2a45 --- /dev/null +++ b/tests/openvino/gen_seq2seq.py @@ -0,0 +1,51 @@ +import datetime +import threading + +from transformers import AutoTokenizer, pipeline + +from optimum.intel import OVModelForSeq2SeqLM + + +model_id = "echarlaix/t5-small-openvino" +model = OVModelForSeq2SeqLM.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + +prompt1 = ["I live in Europe"] +prompt2 = ["What is your name?", "The dog is very happy"] +prompt3 = ["It's a beautiful weather today", "Yes", "Good morning"] +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("TRANSLATION", i, ":", r[i]["translation_text"]) + + +def gen_thread(prompt, results, i): + translations = pipe(prompt) + results[i] = translations + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_trans = 0 +for i in range(len(threads)): + threads[i].join() + nu_trans += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "translations:", nu_trans) +print("Throughput:", 
nu_trans / ((end - start).total_seconds()), "translations/s") diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index 1fce7c3fc8..2dc56fbe4b 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -55,6 +55,9 @@ from optimum.utils.import_utils import _diffusers_version +F32_CONFIG = {"CACHE_DIR": "", "INFERENCE_PRECISION_HINT": "f32"} + + def _generate_inputs(batch_size=1): inputs = { "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, @@ -208,7 +211,7 @@ class OVStableDiffusionPipelineTest(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_diffusers(self, model_arch: str): model_id = MODEL_NAMES[model_arch] - ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True) + ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) @@ -361,7 +364,7 @@ class OVtableDiffusionXLPipelineTest(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_diffusers(self, model_arch: str): - ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.text_encoder_2, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) @@ -407,7 +410,7 @@ def test_image_reproducibility(self, model_arch: str): # Verify every subcomponent is compiled by default for component in {"unet", "vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"}: - self.assertIsInstance(getattr(pipeline, component).request, CompiledModel) + self.assertIsInstance(getattr(pipeline, component).compiled_model, CompiledModel) batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 inputs = _generate_inputs(batch_size) @@ -447,11 +450,11 @@ class OVStableDiffusionXLImg2ImgPipelineTest(unittest.TestCase): def test_inference(self): model_id = "hf-internal-testing/tiny-stable-diffusion-xl-pipe" - pipeline = self.MODEL_CLASS.from_pretrained(model_id) + pipeline = self.MODEL_CLASS.from_pretrained(model_id, ov_config=F32_CONFIG) with tempfile.TemporaryDirectory() as tmp_dir: pipeline.save_pretrained(tmp_dir) - pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir) + pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir, ov_config=F32_CONFIG) batch_size, height, width = 1, 128, 128 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) @@ -488,7 +491,7 @@ class OVLatentConsistencyModelPipelineTest(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES) @unittest.skipIf(parse(_diffusers_version) <= Version("0.21.4"), "not supported with this diffusers version") def test_compare_to_diffusers(self, model_arch: str): - ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) From 
e189e4055a5c8bdd3676c1a9733324d1f6236b4a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 25 Jan 2024 10:23:42 +0100 Subject: [PATCH 04/15] concurrency via model cloning in encoders and decoders --- optimum/intel/openvino/modeling.py | 112 +++++++++++++++--------- optimum/intel/openvino/modeling_base.py | 16 +++- 2 files changed, 84 insertions(+), 44 deletions(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index e7f10fb7dd..b6242b0fa2 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -15,7 +15,7 @@ import os from pathlib import Path from typing import Optional, Union - +import time import numpy as np import openvino import torch @@ -180,6 +180,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -197,13 +198,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -252,6 +255,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -269,18 +273,21 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) + start_logits = ( - torch.from_numpy(infer_request.get_tensor("start_logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("start_logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("start_logits").data + else self.request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(infer_request.get_tensor("end_logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("end_logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("end_logits").data + else self.request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -328,6 +335,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -345,13 +353,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return TokenClassifierOutput(logits=logits) @@ -399,6 +409,7 @@ def forward( 
**kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -416,13 +427,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) last_hidden_state = ( - torch.from_numpy(infer_request.get_tensor("last_hidden_state").data).to(self.device) + torch.from_numpy(self.request.get_tensor("last_hidden_state").data).to(self.device) if not np_inputs - else infer_request.get_tensor("last_hidden_state").data + else self.request.get_tensor("last_hidden_state").data ) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -471,6 +484,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -488,13 +502,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return MaskedLMOutput(logits=logits) @@ -611,6 +627,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(pixel_values, np.ndarray) if not np_inputs: @@ -621,13 +638,15 @@ def forward( } # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return ImageClassifierOutput(logits=logits) @@ -677,6 +696,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_values, np.ndarray) if not np_inputs: @@ -692,13 +712,15 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -756,6 +778,8 @@ def forward( attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): + self.compile() + self.create_infer_request() np_inputs = isinstance(input_values, np.ndarray) if not np_inputs: input_values = np.array(input_values) @@ -770,13 +794,15 @@ def forward( 
inputs["attention_mask"] = attention_mask # Run inference - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs) - infer_request.wait() + if self.async_exec == True: + self.request.start_async(inputs) + self.request.wait() + else: + self.request.infer(inputs) logits = ( - torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if not np_inputs - else infer_request.get_tensor("logits").data + else self.request.get_tensor("logits").data ) return CausalLMOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index deff686e38..e65bdddb0c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -89,6 +89,7 @@ def __init__( self.model = model self.request = None + self.async_exec = False self.compiled_model = None if enable_compilation: self.compile() @@ -356,7 +357,11 @@ def compile(self): # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(self.request) + _print_compiled_model_properties(self.compiled_model) + + def create_infer_request(self): + if self.request is None: + self.request = self.compiled_model.create_infer_request() def _reshape( self, @@ -394,6 +399,7 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid """ self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False self.model = self._reshape(self.model, batch_size, sequence_length, height, width) + self.compiled_model = None self.request = None return self @@ -409,6 +415,14 @@ def half(self): def forward(self, *args, **kwargs): raise NotImplementedError + + def clone(self): + self.compile() + model_cloned = self.__class__(self.model, config=self.config, compile=False) + model_cloned.compiled_model = self.compiled_model + model_cloned.async_exec = True + model_cloned._device = self._device + return model_cloned def can_generate(self) -> bool: """ From f75021bfeb59861578e7bab3174a711a79c2b5fb Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 25 Jan 2024 12:44:52 +0100 Subject: [PATCH 05/15] fix clone performance --- optimum/intel/openvino/modeling_base.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 2d9b6b5ea9..e6fa93f2cf 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -17,7 +17,6 @@ from pathlib import Path from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union - import openvino from huggingface_hub import hf_hub_download from openvino import Core, convert_model @@ -69,12 +68,10 @@ def __init__( self.ov_config = ov_config if ov_config is not None else {"PERFORMANCE_HINT": "LATENCY"} self.preprocessors = kwargs.get("preprocessors", []) enable_compilation = kwargs.get("compile", True) - if self.is_dynamic: height = -1 if self.export_feature == "image-classification" else None width = -1 if self.export_feature == "image-classification" else None model = self._reshape(model, -1, -1, height, width) - input_names = {} for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) @@ 
-86,14 +83,12 @@ def __init__( names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx self.output_names = output_names - self.model = model self.request = None self.async_exec = False self.compiled_model = None if enable_compilation: self.compile() - if is_transformers_version("<=", "4.25.1"): self.generation_config = None else: @@ -422,7 +417,7 @@ def forward(self, *args, **kwargs): def clone(self): self.compile() - model_cloned = self.__class__(self.model, config=self.config, compile=False) + model_cloned = self.__class__(self.model, config=self.config, compile=False, dynamic_shapes=False) model_cloned.compiled_model = self.compiled_model model_cloned.async_exec = True model_cloned._device = self._device From e9bb94116f7e2bd120f8534a3fbb9539ce0fd6df Mon Sep 17 00:00:00 2001 From: mzegla Date: Fri, 26 Jan 2024 15:15:28 +0100 Subject: [PATCH 06/15] init --- tests/openvino/test_modeling.py | 94 ++++++++++++++++++++++++++++++++- tests/openvino/utils_tests.py | 22 ++++++++ 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 74f033b95e..5fcc368252 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -26,6 +26,7 @@ from datasets import load_dataset from evaluate import evaluator from parameterized import parameterized +import threading from PIL import Image from transformers import ( AutoFeatureExtractor, @@ -50,7 +51,7 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor -from utils_tests import MODEL_NAMES +from utils_tests import MODEL_NAMES, OVThread from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( @@ -253,7 +254,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" + inputs = "This is sample input." tokens = tokenizer(inputs, return_tensors="pt") with torch.no_grad(): transformers_outputs = transformers_model(**tokens) @@ -268,6 +269,51 @@ def test_compare_to_transformers(self, model_arch): del ov_model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForSequenceClassification.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs_list = ["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", + "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen", + "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", + "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. 
Might be the worst thing I've seen",] + tokens_list = [] + for inputs in inputs_list: + tokens_list.append(tokenizer(inputs, return_tensors="pt")) + + transformers_outputs_list = [] + for tokens in tokens_list: + with torch.no_grad(): + transformers_outputs_list.append(transformers_model(**tokens)) + + def run_ov_model(inputs, transformers_outputs): + for input_type in ["pt", "np"]: + tokens = tokenizer(inputs, return_tensors=input_type) + ov_outputs = ov_model(**tokens) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + # Compare tensor outputs + close_enough = torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4) + #print(f"[{threading.get_ident()}] OV model logits: {ov_outputs.logits} Torch model logits: {transformers_outputs.logits} Close enough: {close_enough}") + self.assertTrue(close_enough) + + NUM_THREADS=4 + threads = [] + for i in range(NUM_THREADS): + threads.append(OVThread(target=run_ov_model, args=[inputs_list[i], transformers_outputs_list[i]])) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -524,6 +570,50 @@ def test_compare_to_transformers(self, model_arch): del ov_model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + self.assertTrue(ov_model.use_cache) + self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode") + transformers_model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs_list = ["This is a sample", "The sky is blue", "The money comes from", "Some other random sample"] + tokens_list = [tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) for inputs in inputs_list] + + def run_ov_model(tokens): + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + ov_outputs = ov_model(**tokens, position_ids=position_ids) + + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in ov_outputs) + self.assertIsInstance(ov_outputs.past_key_values, tuple) + if self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode": + self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + # Compare tensor outputs + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + + NUM_THREADS=4 + threads = [] + for i in range(NUM_THREADS): + threads.append(OVThread(target=run_ov_model, args=[tokens_list[i]])) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = 
MODEL_NAMES[model_arch] diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 6cfeb29bb4..f8bfc1cc97 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -14,6 +14,7 @@ import numpy as np import torch +import threading MODEL_NAMES = { @@ -132,3 +133,24 @@ def get_num_quantized_nodes(ov_model): if "4" in elem.get_output_element_type(i).get_type_name(): num_int4 += 1 return num_fake_quantize, num_int8, num_int4 + + +### Multithreading + +class OVThread(threading.Thread): + def __init__(self, target, args): + super().__init__() + self.target = target + self.args = args + + def run(self): + self.exception = None + try: + self.target(*self.args) + except Exception as e: + self.exception = e + + def join(self): + super().join() + if self.exception: + raise self.exception \ No newline at end of file From 5f85cbb7ccd8fc128b774a91073680221b6544f8 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 26 Jan 2024 15:44:36 +0100 Subject: [PATCH 07/15] fix next_beam_idx initialization --- optimum/intel/openvino/modeling_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 1bd57042c8..88498d8c22 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -134,6 +134,7 @@ def __init__( self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] + self.next_beam_idx = None is_stateful_supported = ensure_stateful_is_available(warn=False) if self.use_cache and not self.stateful: logger.warn( From 912cf3abf365528bef68f050e140d39edfbc79b9 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 26 Jan 2024 16:08:43 +0100 Subject: [PATCH 08/15] init version --- optimum/intel/openvino/modeling_diffusion.py | 55 +++++++++++++++----- tests/openvino/gen_img.py | 15 ++++-- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index f5bb5a72af..81d0f2bdc8 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -512,6 +512,24 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + def clone(self): + self.compile() + pipe_cloned = self.__class__( + self.unet.model, + self._internal_dict, + scheduler=self.scheduler, + vae_decoder=self.vae_decoder.model, + vae_encoder=self.vae_encoder.model, + text_encoder=self.text_encoder.model, +# text_encoder_2=self.text_encoder_2.clone(), + tokenizer=self.tokenizer, + tokenizer_2=self.tokenizer_2, + feature_extractor=self.feature_extractor) + pipe_cloned.unet = self.unet.clone() + pipe_cloned.text_encoder = self.text_encoder.clone() + pipe_cloned.vae_decoder = self.vae_decoder.clone() + pipe_cloned.vae_encoder = self.vae_encoder.clone() + return pipe_cloned class OVModelPart: CONFIG_NAME = "config.json" @@ -533,6 +551,7 @@ def __init__( } self.ov_config = ov_config or {**self.parent_model.ov_config} self.compiled_model = None + self.request = None self._model_name = model_name self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME @@ 
-554,6 +573,16 @@ def _compile(self): logger.info(f"{self.device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.compiled_model) + def create_infer_request(self): + if self.request is None: + self.request = self.compiled_model.create_infer_request() + + def clone(self): + model_cloned = self.__class__(self.model, self.parent_model, ov_config=self.ov_config) + model_cloned.model = self.model + model_cloned.compiled_model = self.compiled_model + return model_cloned + @property def device(self): return self.parent_model._device @@ -571,16 +600,16 @@ def __init__( def __call__(self, input_ids: np.ndarray): self._compile() + self.create_infer_request() inputs = { "input_ids": input_ids, } - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs, share_inputs=True) - infer_request.wait() - outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + self.request.start_async(inputs, share_inputs=True) + self.request.wait() + outputs = [self.request.get_tensor(output).data for output in self.request.results] return outputs - + class OVModelUnet(OVModelPart): def __init__( @@ -598,6 +627,7 @@ def __call__( timestep_cond: Optional[np.ndarray] = None, ): self._compile() + self.create_infer_request() inputs = { "sample": sample, @@ -612,10 +642,9 @@ def __call__( if timestep_cond is not None: inputs["timestep_cond"] = timestep_cond - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs, share_inputs=True) - infer_request.wait() - outputs = [infer_request.get_tensor(output).data for output in infer_request.results] + self.request.start_async(inputs, share_inputs=True) + self.request.request.wait() + outputs = [self.request.get_tensor(output).data for output in self.results] return outputs @@ -627,14 +656,14 @@ def __init__( def __call__(self, latent_sample: np.ndarray): self._compile() + self.create_infer_request() inputs = { "latent_sample": latent_sample, } - infer_request = self.compiled_model.create_infer_request() - infer_request.start_async(inputs, share_inputs=True) - infer_request.wait() - outputs = [infer_request.results[output].data for output in infer_request.results] + self.request.start_async(inputs, share_inputs=True) + self.request.wait() + outputs = [self.request.results[output].data for output in self.request.results] return outputs def _compile(self): diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py index 6e06132ba5..9f17b7a1e6 100644 --- a/tests/openvino/gen_img.py +++ b/tests/openvino/gen_img.py @@ -6,19 +6,25 @@ from optimum.intel.openvino import OVStableDiffusionPipeline -MODEL_PATH = "/model" +MODEL_PATH = "/home/devuser/model_server/demos/python_demos/stable_diffusion/model" OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="CPU", ov_config=OV_CONFIG) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) +vae_decoder_clon = pipe.vae_decoder.clone() +print(vae_decoder_clon, vae_decoder_clon.model) + +unet_clon = pipe.unet.clone() +print(unet_clon, unet_clon.model) + prompt1 = [" Zebras in space "] prompt2 = [" The statue of liberty in New York", " Big Ben in London "] prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] prompts = [prompt1, prompt2, prompt3] -NUM_THREADS = 3 +NUM_THREADS = 1 threads = [None] * NUM_THREADS results = [None] * NUM_THREADS @@ -33,8 +39,11 @@ def save_response(t, p, r): def 
gen_thread(prompt, results, i): + print("cloning pipe") + pipe_exec = pipe.clone() + print("pipe cloned") text = prompt - images = pipe(text).images + images = pipe_exec(text).images results[i] = images From 03b548d3d8e5b372521d4ab280f172d017d38e2c Mon Sep 17 00:00:00 2001 From: mzegla Date: Mon, 29 Jan 2024 17:22:19 +0100 Subject: [PATCH 09/15] more tests --- tests/openvino/test_modeling.py | 177 +++++++++++++++++++----- tests/openvino/test_stable_diffusion.py | 53 ++++++- tests/openvino/utils_tests.py | 13 +- 3 files changed, 206 insertions(+), 37 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 5fcc368252..5e4767c8c4 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -26,7 +26,7 @@ from datasets import load_dataset from evaluate import evaluator from parameterized import parameterized -import threading +import pytest from PIL import Image from transformers import ( AutoFeatureExtractor, @@ -51,7 +51,7 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor -from utils_tests import MODEL_NAMES, OVThread +from utils_tests import MODEL_NAMES, run_on_multiple_threads from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( @@ -270,6 +270,7 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.skip(reason="Clone operation not implemented for this class of models") def test_compare_to_transformers_multithreading(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) @@ -277,23 +278,19 @@ def test_compare_to_transformers_multithreading(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs_list = ["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", - "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen", - "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.", - "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen",] - tokens_list = [] - for inputs in inputs_list: - tokens_list.append(tokenizer(inputs, return_tensors="pt")) - - transformers_outputs_list = [] - for tokens in tokens_list: - with torch.no_grad(): - transformers_outputs_list.append(transformers_model(**tokens)) + inputs_list = [["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."], + ["This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen"], + ["This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."], + ["This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. 
Might be the worst thing I've seen",]] - def run_ov_model(inputs, transformers_outputs): + def run_ov_model(inputs): + tokens = tokenizer(inputs, return_tensors="pt") + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + ov_model_instance = ov_model.clone() for input_type in ["pt", "np"]: tokens = tokenizer(inputs, return_tensors=input_type) - ov_outputs = ov_model(**tokens) + ov_outputs = ov_model_instance(**tokens) self.assertIn("logits", ov_outputs) self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs @@ -301,14 +298,7 @@ def run_ov_model(inputs, transformers_outputs): #print(f"[{threading.get_ident()}] OV model logits: {ov_outputs.logits} Torch model logits: {transformers_outputs.logits} Close enough: {close_enough}") self.assertTrue(close_enough) - NUM_THREADS=4 - threads = [] - for i in range(NUM_THREADS): - threads.append(OVThread(target=run_ov_model, args=[inputs_list[i], transformers_outputs_list[i]])) - for thread in threads: - thread.start() - for thread in threads: - thread.join() + run_on_multiple_threads(run_ov_model, args_list=inputs_list) del transformers_model del ov_model @@ -580,15 +570,16 @@ def test_compare_to_transformers_multithreading(self, model_arch): self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode") transformers_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs_list = ["This is a sample", "The sky is blue", "The money comes from", "Some other random sample"] - tokens_list = [tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) for inputs in inputs_list] + inputs_list = ["This is a sample", "Here is another sample", "That's the thrid one", "This is the last sample"] + tokens_list = [[tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None)] for inputs in inputs_list] def run_ov_model(tokens): + ov_model_instance = ov_model.clone() position_ids = None if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: input_shape = tokens["input_ids"].shape position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ov_outputs = ov_model(**tokens, position_ids=position_ids) + ov_outputs = ov_model_instance(**tokens, position_ids=position_ids) self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) @@ -601,14 +592,7 @@ def run_ov_model(tokens): # Compare tensor outputs self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) - NUM_THREADS=4 - threads = [] - for i in range(NUM_THREADS): - threads.append(OVThread(target=run_ov_model, args=[tokens_list[i]])) - for thread in threads: - thread.start() - for thread in threads: - thread.join() + run_on_multiple_threads(run_ov_model, args_list=tokens_list) del transformers_model del ov_model @@ -631,6 +615,28 @@ def test_pipeline(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.xfail(reason="use_cache not handled properly during clone()") + def test_pipeline_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False, compile=False) + model.config.encoder_no_repeat_ngram_size = 0 + model.to("cpu") + model.half() + model.compile() + def run_ov_model(input_text): + # Tokenizer is 
not supposed to be shared by multiple threads + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model.clone(), tokenizer=tokenizer) + outputs = pipe(input_text, max_length=10) + self.assertEqual(pipe.device, model.device) + self.assertTrue(all(input_text in item["generated_text"] for item in outputs)) + del pipe + inputs_list = [["This is a sample"], ["This is a second sample"], ["This is a third sample"]] + run_on_multiple_threads(run_ov_model, args_list=inputs_list) + del model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_multiple_inputs(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -647,6 +653,28 @@ def test_multiple_inputs(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_multiple_inputs_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] + tokens = tokenizer(texts, padding=True, return_tensors="pt") + generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2) + + def run_ov_model(tokens): + model_instance = model.clone() + outputs = model_instance.generate(**tokens, generation_config=generation_config) + self.assertIsInstance(outputs, torch.Tensor) + self.assertEqual(outputs.shape[0], 3) + + tokens_list = [[tokens], [tokens], [tokens], [tokens]] # running in 4 threads + run_on_multiple_threads(run_ov_model, args_list=tokens_list) + del model + gc.collect() + def test_model_and_decoder_same_device(self): model_id = MODEL_NAMES["gpt2"] model = OVModelForCausalLM.from_pretrained(model_id, export=True) @@ -926,6 +954,48 @@ def test_compare_to_transformers(self, model_arch): gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + # This works the old way - infer request per inference, no cloning + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + + self.assertIsInstance(ov_model.encoder, OVEncoder) + self.assertIsInstance(ov_model.decoder, OVDecoder) + self.assertIsInstance(ov_model.decoder_with_past, OVDecoder) + self.assertIsInstance(ov_model.config, PretrainedConfig) + + transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inputs_list = ["This is a sample input for the first thread", + "Input sample for another thread", + "This is a third sample input", + "This last sample is for the last thread"] + args_list = [] + for inputs in inputs_list: + tokens = tokenizer(inputs, return_tensors="pt") + decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + args_list.append([tokens, decoder_inputs]) + + def run_ov_model(tokens, decoder_inputs): + ov_outputs = ov_model(**tokens, **decoder_inputs) + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens, **decoder_inputs) + # Compare tensor 
outputs + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + + run_on_multiple_threads(run_ov_model, args_list=args_list) + del transformers_model + del ov_model + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -959,6 +1029,43 @@ def test_pipeline(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + # This works the old way - infer request per inference, no cloning + def test_pipeline_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, compile=False) + model.half() + model.to("cpu") + model.compile() + + def run_ov_model(text): + # Tokenizer is not supposed to be shared between multiple threads + tokenizer = AutoTokenizer.from_pretrained(model_id) + # Text2Text generation + pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["generated_text"], str) + + # Summarization + pipe = pipeline("summarization", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["summary_text"], str) + + # Translation + pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["translation_text"], str) + del pipe + + texts_list = [["This is a test"], ["This is a test, but for another thread"], ["Yet another test"]] + run_on_multiple_threads(target=run_ov_model, args_list=texts_list) + del model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index 36b1794015..84e94bcae7 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -30,7 +30,8 @@ from openvino.runtime.ie_api import CompiledModel from packaging.version import Version, parse from parameterized import parameterized -from utils_tests import MODEL_NAMES, SEED +import pytest +from utils_tests import MODEL_NAMES, SEED, run_on_multiple_threads from optimum.intel import ( OVLatentConsistencyModelPipeline, @@ -251,6 +252,56 @@ def test_compare_to_diffusers(self, model_arch: str): # Compare model devices self.assertEqual(pipeline.device.type, ov_pipeline.device) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.xfail(reason="OVStableDiffusionPipeline does not implement clone()") + def test_compare_to_diffusers_multithreading(self, model_arch: str): + model_id = MODEL_NAMES[model_arch] + ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) + self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) + self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) + self.assertIsInstance(ov_pipeline.unet, OVModelUnet) + self.assertIsInstance(ov_pipeline.config, Dict) + + pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + pipeline.safety_checker = None + batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64 + + def run_ov_model(prompt): 
+ ov_pipeline_instance = ov_pipeline.clone() + latents = ov_pipeline_instance.prepare_latents( + batch_size * num_images_per_prompt, + ov_pipeline_instance.unet.config["in_channels"], + height, + width, + dtype=np.float32, + generator=np.random.RandomState(0), + ) + + kwargs = { + "prompt": prompt, + "num_inference_steps": 1, + "num_images_per_prompt": num_images_per_prompt, + "height": height, + "width": width, + "guidance_rescale": 0.1, + } + + for output_type in ["latent", "np"]: + ov_outputs = ov_pipeline_instance(latents=latents, output_type=output_type, **kwargs).images + self.assertIsInstance(ov_outputs, np.ndarray) + with torch.no_grad(): + outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + # Compare model outputs + self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) + + # Compare model devices + self.assertEqual(pipeline.device.type, ov_pipeline.device) + + prompt_list = [["sailing ship in storm by Leonardo da Vinci"], ["central park during christmas"], ["zebras in space"]] + run_on_multiple_threads(target=run_ov_model, args_list=prompt_list) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_image_reproducibility(self, model_arch: str): model_id = MODEL_NAMES[model_arch] diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f8bfc1cc97..1a202c1dd1 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -153,4 +153,15 @@ def run(self): def join(self): super().join() if self.exception: - raise self.exception \ No newline at end of file + raise self.exception + +# Each set of args is run in a separate thread. +# Amount of such sets define how many threads are spawned. +def run_on_multiple_threads(target, args_list): + threads = [] + for args in args_list: + threads.append(OVThread(target=target, args=args)) + for thread in threads: + thread.start() + for thread in threads: + thread.join() \ No newline at end of file From e5d9c755afd3e6d605f34142d44395115c1d184a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 14 Feb 2024 15:08:52 +0100 Subject: [PATCH 10/15] running conncurrent execution of stable diffusion pipe with cloning --- optimum/intel/openvino/modeling_diffusion.py | 61 +++++++++++--------- tests/openvino/gen_img.py | 15 +++-- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 81d0f2bdc8..5e12279d9b 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -94,7 +94,7 @@ def __init__( self._model_save_dir = ( Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir ) - self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) + self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) if vae_decoder is not None else None self.unet = OVModelUnet(unet, self) self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None self.text_encoder_2 = ( @@ -104,13 +104,12 @@ def __init__( ) self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - + if vae_decoder is not None: + if "block_out_channels" in self.vae_decoder.config: + 
self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) + else: + self.vae_scale_factor = 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 self.scheduler = scheduler @@ -270,8 +269,8 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - - return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + pipe = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + return pipe @classmethod def _from_transformers( @@ -514,21 +513,29 @@ def _save_config(self, save_directory): def clone(self): self.compile() - pipe_cloned = self.__class__( - self.unet.model, - self._internal_dict, - scheduler=self.scheduler, - vae_decoder=self.vae_decoder.model, - vae_encoder=self.vae_encoder.model, - text_encoder=self.text_encoder.model, -# text_encoder_2=self.text_encoder_2.clone(), - tokenizer=self.tokenizer, - tokenizer_2=self.tokenizer_2, - feature_extractor=self.feature_extractor) + config = self._internal_dict + scheduler = self.scheduler + unet = self.unet.model + model_save_dir = self._model_save_dir + pipe_cloned = self.__class__(unet=unet, + config=config, + scheduler=scheduler, + compile=False, + dynamic_shapes=False, + model_save_dir=model_save_dir) pipe_cloned.unet = self.unet.clone() - pipe_cloned.text_encoder = self.text_encoder.clone() - pipe_cloned.vae_decoder = self.vae_decoder.clone() - pipe_cloned.vae_encoder = self.vae_encoder.clone() + if self.vae_decoder is not None: + pipe_cloned.vae_decoder = self.vae_decoder.clone() + if self.text_encoder is not None: + pipe_cloned.text_encoder= self.text_encoder.clone() + if self.text_encoder_2 is not None: + pipe_cloned.text_encoder_2= self.text_encoder_2.clone() + if self.vae_encoder is not None: + pipe_cloned.vae_encoder= self.vae_encoder.clone() + pipe_cloned.vae_scale_factor = self.vae_scale_factor + pipe_cloned.image_processor = self.image_processor + pipe_cloned.tokenizer = self.tokenizer + pipe_cloned.tokenizer_2 = self.tokenizer_2 return pipe_cloned class OVModelPart: @@ -566,7 +573,7 @@ def _compile(self): ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") - logger.info(f"Compiling the {self._model_name} to {self.device} ...") + logger.info(f"Compiling the {self._model_name} to {self.device} with config {self.ov_config} ... 
") self.compiled_model = core.compile_model(self.model, self.device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: @@ -643,8 +650,8 @@ def __call__( inputs["timestep_cond"] = timestep_cond self.request.start_async(inputs, share_inputs=True) - self.request.request.wait() - outputs = [self.request.get_tensor(output).data for output in self.results] + self.request.wait() + outputs = [self.request.get_tensor(output).data for output in self.request.results] return outputs diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py index 9f17b7a1e6..1fe61b2239 100644 --- a/tests/openvino/gen_img.py +++ b/tests/openvino/gen_img.py @@ -6,25 +6,23 @@ from optimum.intel.openvino import OVStableDiffusionPipeline -MODEL_PATH = "/home/devuser/model_server/demos/python_demos/stable_diffusion/model" +MODEL_PATH = "/model" OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} -pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="CPU", ov_config=OV_CONFIG) +pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="GPU", ov_config=OV_CONFIG, compile=True, dynamic_shapes=True) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) vae_decoder_clon = pipe.vae_decoder.clone() -print(vae_decoder_clon, vae_decoder_clon.model) - unet_clon = pipe.unet.clone() -print(unet_clon, unet_clon.model) prompt1 = [" Zebras in space "] prompt2 = [" The statue of liberty in New York", " Big Ben in London "] prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] + prompts = [prompt1, prompt2, prompt3] -NUM_THREADS = 1 +NUM_THREADS = 3 threads = [None] * NUM_THREADS results = [None] * NUM_THREADS @@ -39,9 +37,10 @@ def save_response(t, p, r): def gen_thread(prompt, results, i): - print("cloning pipe") + start = datetime.datetime.now() pipe_exec = pipe.clone() - print("pipe cloned") + end = datetime.datetime.now() + print("Clonning time [s]", ((end - start).total_seconds())) text = prompt images = pipe_exec(text).images results[i] = images From 34e7e288f11a7d824942d1c63d0e7d0151b3b042 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 16:30:57 +0100 Subject: [PATCH 11/15] add concurrency examples --- examples/openvino/multithreading/README.md | 49 +++++++++++ examples/openvino/multithreading/gen_image.py | 60 ++++++++++++++ .../openvino/multithreading/gen_seq2seq.py | 51 ++++++++++++ examples/openvino/multithreading/gen_text.py | 81 +++++++++++++++++++ .../openvino/multithreading/requirements.txt | 2 + 5 files changed, 243 insertions(+) create mode 100644 examples/openvino/multithreading/README.md create mode 100644 examples/openvino/multithreading/gen_image.py create mode 100644 examples/openvino/multithreading/gen_seq2seq.py create mode 100644 examples/openvino/multithreading/gen_text.py create mode 100644 examples/openvino/multithreading/requirements.txt diff --git a/examples/openvino/multithreading/README.md b/examples/openvino/multithreading/README.md new file mode 100644 index 0000000000..19759a1f30 --- /dev/null +++ b/examples/openvino/multithreading/README.md @@ -0,0 +1,49 @@ +# Execution in multi-threaded environment + +## Overview + +This example demonstrates how to execute the pipelines from Hugging Face transformers with multi concurency. 
+A typical scenario is a multi-threaded application that avoids duplicating the model allocation in the host memory. + +By default, the execution of transformers pipelines with the OpenVINO Runtime backend is single threaded. Running the generation process in parallel can cause the error +`RuntimeError: Infer Request is busy`. + +A simple technique can overcome this limitation: calling the `clone` method on the model or the pipeline. It duplicates the execution object while sharing the OpenVINO compiled model in the host memory. The cloned object should not modify the model by reshaping it, changing its precision, or recompiling it. +The snippet below applies this concept: + +```python +pipe = OVStableDiffusionPipeline.from_pretrained( + MODEL_PATH, ov_config=OV_CONFIG, compile=True +) +def thread(prompt): + pipe_exec = pipe.clone() + images = pipe_exec(prompt).images + # Do something with images + +T1 = threading.Thread(target=thread, args=("my prompt",)) +T1.start() +``` +Note that the `clone` operation is quick and does not duplicate the memory usage. It just creates a new execution context for the generation algorithm. + +Check the simple examples below to see how it can be applied in practice. + +## Preparing the Python environment +```bash +pip install -r examples/openvino/multithreading/requirements.txt +``` + +## Text generation + +```bash +python examples/openvino/multithreading/gen_text.py +``` +## Image generation +```bash +python examples/openvino/multithreading/gen_image.py +``` + +## Text translation with seq2seq + +```bash +python examples/openvino/multithreading/gen_seq2seq.py +``` diff --git a/examples/openvino/multithreading/gen_image.py b/examples/openvino/multithreading/gen_image.py new file mode 100644 index 0000000000..ffa392f26e --- /dev/null +++ b/examples/openvino/multithreading/gen_image.py @@ -0,0 +1,60 @@ +import datetime +import threading + +from optimum.intel.openvino import OVStableDiffusionPipeline + + +MODEL_PATH = "runwayml/stable-diffusion-v1-5" +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} + + +pipe = OVStableDiffusionPipeline.from_pretrained( + MODEL_PATH, ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True +) + +vae_decoder_clon = pipe.vae_decoder.clone() +unet_clon = pipe.unet.clone() + +prompt1 = [" Zebras in space "] +prompt2 = [" The statue of liberty in New York", " Big Ben in London "] +prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] + +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def save_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("IMG:", i) + r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") + +def gen_thread(prompt, results, i): + start = datetime.datetime.now() + pipe_exec = pipe.clone() + end = datetime.datetime.now() + print("Cloning time [s]", ((end - start).total_seconds())) + text = prompt + images = pipe_exec(text).images + results[i] = images + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_img = 0 +for i in range(len(threads)): + threads[i].join() + nu_img += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + save_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "images:", nu_img) +print("Throughput:", nu_img * 60 / ((end - start).total_seconds()), "images/min") diff
--git a/examples/openvino/multithreading/gen_seq2seq.py b/examples/openvino/multithreading/gen_seq2seq.py new file mode 100644 index 0000000000..27d3ed2a45 --- /dev/null +++ b/examples/openvino/multithreading/gen_seq2seq.py @@ -0,0 +1,51 @@ +import datetime +import threading + +from transformers import AutoTokenizer, pipeline + +from optimum.intel import OVModelForSeq2SeqLM + + +model_id = "echarlaix/t5-small-openvino" +model = OVModelForSeq2SeqLM.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + +prompt1 = ["I live in Europe"] +prompt2 = ["What is your name?", "The dog is very happy"] +prompt3 = ["It's a beautiful weather today", "Yes", "Good morning"] +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("TRANSLATION", i, ":", r[i]["translation_text"]) + + +def gen_thread(prompt, results, i): + translations = pipe(prompt) + results[i] = translations + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_trans = 0 +for i in range(len(threads)): + threads[i].join() + nu_trans += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "translations:", nu_trans) +print("Throughput:", nu_trans / ((end - start).total_seconds()), "translations/s") diff --git a/examples/openvino/multithreading/gen_text.py b/examples/openvino/multithreading/gen_text.py new file mode 100644 index 0000000000..2fed55276e --- /dev/null +++ b/examples/openvino/multithreading/gen_text.py @@ -0,0 +1,81 @@ +import threading +from datetime import datetime +from transformers import AutoConfig, AutoTokenizer, set_seed +from optimum.intel import OVModelForCausalLM + + +set_seed(10) +model_id = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" +tokenizer = AutoTokenizer.from_pretrained(model_id) +tokenizer.pad_token = "[PAD]" +tokenizer.padding_side = "left" +NUM_THREADS = 3 +prompt1 = [": Question: What is the weather like now? Answer: "] +prompt2 = [": Question: What is Openvino?", ": Question: What the the relativity theory? Answer: "] +prompt3 = [ + ": Question: Are cats smarter that dogs? Answer: ", + ": Question: How big is an elephant? Answer: ", + ": Question: The water in the ocean is much hotter than before? 
Answer: ", +] +prompts = [prompt1, prompt2, prompt3] + +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "2"} +model = OVModelForCausalLM.from_pretrained( + model_id, + config=AutoConfig.from_pretrained(model_id, trust_remote_code=True), + ov_config=OV_CONFIG, + compile=True, + export=True +) + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for answer in r: + print("Answer:") + print(tokenizer.decode(answer, skip_special_tokens=True)) + + +def gen_thread(prompt, results, i): + inputs = tokenizer(prompt, return_tensors="pt", padding=True) + generate_kwargs = { + "input_ids": inputs.input_ids, + "max_new_tokens": 200, + "temperature": 1.0, + "do_sample": True, + "top_p": 1.0, + "top_k": 50, + "num_beams": 5, + "repetition_penalty": 1.1, + } + start = datetime.now() + model_exec = model.clone() + end = datetime.now() + print("cloning model duration", (end - start).total_seconds() * 1000000, "us") + outputs = model_exec.generate(**generate_kwargs) + num_tok = 0 + for x in range(len(prompt)): + num_tok += outputs[x].numel() - inputs.get("input_ids")[x].numel() + results[i] = outputs, num_tok + + +start = datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() + +total_tok = 0 +for i in range(len(threads)): + threads[i].join() + total_tok += results[i][1] +end = datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i][0]) + +print("Generation time [s]", ((end - start).total_seconds()), "tokens:", total_tok) +print("Throughput:", total_tok * 60 / ((end - start).total_seconds()), "tokens/min") diff --git a/examples/openvino/multithreading/requirements.txt b/examples/openvino/multithreading/requirements.txt new file mode 100644 index 0000000000..b05f68f799 --- /dev/null +++ b/examples/openvino/multithreading/requirements.txt @@ -0,0 +1,2 @@ +optimum-intel[openvino, nncf]"@git+https://github.com/huggingface/optimum-intel.git +transformers \ No newline at end of file From f4d21d8fb4e938fc49dd5d33f7c5a64fbb217c9d Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 16:32:43 +0100 Subject: [PATCH 12/15] preserve request attribure as deprecated --- optimum/intel/openvino/modeling.py | 85 ++++++++++---------- optimum/intel/openvino/modeling_base.py | 14 ++-- optimum/intel/openvino/modeling_decoder.py | 17 ++-- optimum/intel/openvino/modeling_diffusion.py | 26 +++--- optimum/intel/openvino/modeling_seq2seq.py | 7 +- 5 files changed, 80 insertions(+), 69 deletions(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 69dc29a6b6..aafd377991 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -132,6 +132,7 @@ def to(self, device: str): if isinstance(device, str): self._device = device.upper() self.compiled_model = None + self.request = None else: logger.warning(f"device must be of type {str} but got {type(device)} instead") @@ -203,14 +204,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs 
- else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -278,20 +279,20 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) start_logits = ( - torch.from_numpy(self.request.get_tensor("start_logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("start_logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("start_logits").data + else self.infer_request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(self.request.get_tensor("end_logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("end_logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("end_logits").data + else self.infer_request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -358,14 +359,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return TokenClassifierOutput(logits=logits) @@ -432,14 +433,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) last_hidden_state = ( - torch.from_numpy(self.request.get_tensor("last_hidden_state").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("last_hidden_state").data).to(self.device) if not np_inputs - else self.request.get_tensor("last_hidden_state").data + else self.infer_request.get_tensor("last_hidden_state").data ) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -507,14 +508,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return MaskedLMOutput(logits=logits) @@ -643,14 +644,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return 
ImageClassifierOutput(logits=logits) @@ -717,14 +718,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return SequenceClassifierOutput(logits=logits) @@ -799,14 +800,14 @@ def forward( # Run inference if self.async_exec: - self.request.start_async(inputs) - self.request.wait() + self.infer_request.start_async(inputs) + self.infer_request.wait() else: - self.request.infer(inputs) + self.infer_request.infer(inputs) logits = ( - torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if not np_inputs - else self.request.get_tensor("logits").data + else self.infer_request.get_tensor("logits").data ) return CausalLMOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 000afee021..287803a62d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -81,7 +81,8 @@ def __init__( output_names[next((name for name in names if "/" not in name), names[0])] = idx self.output_names = output_names self.model = model - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None self.async_exec = False self.compiled_model = None if enable_compilation: @@ -354,14 +355,15 @@ def compile(self): ov_config["CACHE_DIR"] = str(cache_dir) logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.compiled_model = core.compile_model(self.model, self._device, ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.compiled_model) def create_infer_request(self): - if self.request is None: - self.request = self.compiled_model.create_infer_request() + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() def _reshape( self, @@ -400,7 +402,8 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False self.model = self._reshape(self.model, batch_size, sequence_length, height, width) self.compiled_model = None - self.request = None + self.infer_request = None + self.request = None # Deprecated attribute, use compiled_model instead return self def half(self): @@ -409,8 +412,9 @@ def half(self): """ apply_moc_transformations(self.model, cf=False) compress_model_transformation(self.model) - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead self.compiled_model = None + self.infer_request = None return self def forward(self, *args, **kwargs): diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 
c00f916dbc..fde2cdc7bd 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -206,7 +206,8 @@ def update_pkv_precision(self, force_fp32=False): self.model = self._original_model.clone() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.compiled_model = None def _save_pretrained(self, save_directory: Union[str, Path]): """ @@ -342,8 +343,8 @@ def _make_stateful(self): def create_infer_request(self): if self.compiled_model is None: self.compile() - if self.request is None: - self.request = self.compiled_model.create_infer_request() + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() @add_start_docstrings( @@ -423,7 +424,7 @@ def prepare_inputs( # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: # This is the first iteration in a sequence, reset all states - self.request.reset_state() + self.infer_request.reset_state() # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int) @@ -479,9 +480,9 @@ def forward( ) # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + logits = torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if self.stateful: # Need a marker to differentiate the first generate iteration from the others in # the first condition at the function beginning above. @@ -491,7 +492,7 @@ def forward( if not self.stateful: if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple(self.infer_request.get_tensor(key).data for key in self.key_value_output_names) if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 699a745ffa..a6b6d012d3 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -572,7 +572,8 @@ def __init__( } self.ov_config = ov_config or {**self.parent_model.ov_config} self.compiled_model = None - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None self._model_name = model_name self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME @@ -589,14 +590,15 @@ def _compile(self): logger.info(f"Compiling the {self._model_name} to {self.device} with config {self.ov_config} ... 
") self.compiled_model = core.compile_model(self.model, self.device, self.ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self.device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.compiled_model) def create_infer_request(self): - if self.request is None: - self.request = self.compiled_model.create_infer_request() + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() def clone(self): model_cloned = self.__class__(self.model, self.parent_model, ov_config=self.ov_config) @@ -627,9 +629,9 @@ def __call__(self, input_ids: np.ndarray): inputs = { "input_ids": input_ids, } - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - outputs = [self.request.get_tensor(output).data for output in self.request.results] + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.get_tensor(output).data for output in self.infer_request.results] return outputs @@ -664,9 +666,9 @@ def __call__( if timestep_cond is not None: inputs["timestep_cond"] = timestep_cond - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - outputs = [self.request.get_tensor(output).data for output in self.request.results] + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.get_tensor(output).data for output in self.infer_request.results] return outputs @@ -683,9 +685,9 @@ def __call__(self, latent_sample: np.ndarray): inputs = { "latent_sample": latent_sample, } - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - outputs = [self.request.results[output].data for output in self.request.results] + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.results[output].data for output in self.infer_request.results] return outputs def _compile(self): diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 1c58903d23..98b40866a6 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -419,7 +419,8 @@ def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} self.main_input_name = self.parent_model.main_input_name or "input_ids" self.compiled_model = None - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None @add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING) def forward( @@ -461,6 +462,7 @@ def _compile(self): if self.compiled_model is None: logger.info(f"Compiling the encoder to {self._device} ...") self.compiled_model = core.compile_model(self.model, self._device, ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -496,8 +498,9 @@ def 
__init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.use_past = False self.num_pkv = 4 - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead self.compiled_model = None + self.infer_request = None @add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING) def forward( From c971473692d984e2f0f6519d2aa57820f5f77e30 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 16:50:11 +0100 Subject: [PATCH 13/15] drop not needed tests --- tests/openvino/gen_batch.py | 82 ----------------------------------- tests/openvino/gen_img.py | 63 --------------------------- tests/openvino/gen_seq2seq.py | 51 ---------------------- 3 files changed, 196 deletions(-) delete mode 100644 tests/openvino/gen_batch.py delete mode 100644 tests/openvino/gen_img.py delete mode 100644 tests/openvino/gen_seq2seq.py diff --git a/tests/openvino/gen_batch.py b/tests/openvino/gen_batch.py deleted file mode 100644 index 38ecd6d6b2..0000000000 --- a/tests/openvino/gen_batch.py +++ /dev/null @@ -1,82 +0,0 @@ -import threading -from datetime import datetime - -from transformers import AutoConfig, AutoTokenizer, set_seed - -from optimum.intel import OVModelForCausalLM - - -set_seed(10) -model_path = "/model" -tokenizer = AutoTokenizer.from_pretrained(model_path) -tokenizer.pad_token = "[PAD]" -tokenizer.padding_side = "left" -NUM_THREADS = 3 -prompt1 = [" The weather is "] -prompt2 = [" Openvino is a ", " What the the relativity theory "] -prompt3 = [ - " Are cats smarter that dogs ", - " How big is an elephant ", - " the water in the ocean is much hotter than before ", -] -prompts = [prompt1, prompt2, prompt3] - -OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "1"} -model = OVModelForCausalLM.from_pretrained( - model_path, - config=AutoConfig.from_pretrained(model_path, trust_remote_code=True), - ov_config=OV_CONFIG, - compile=True, -) - -threads = [None] * NUM_THREADS -results = [None] * NUM_THREADS - - -def print_response(t, p, r): - print("THREAD", t) - print("PROMPT:", p) - for answer in r: - print("Answer:") - print(tokenizer.decode(answer, skip_special_tokens=True)) - - -def gen_thread(prompt, results, i): - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - generate_kwargs = { - "input_ids": inputs.input_ids, - "max_new_tokens": 200, - "temperature": 1.0, - "do_sample": True, - "top_p": 1.0, - "top_k": 50, - "num_beams": 5, - "repetition_penalty": 1.1, - } - start = datetime.now() - model_exec = model.clone() - end = datetime.now() - print("cloning model duration", (end - start).total_seconds() * 1000000, "us") - outputs = model_exec.generate(**generate_kwargs) - num_tok = 0 - for x in range(len(prompt)): - num_tok += outputs[x].numel() - inputs.get("input_ids")[x].numel() - results[i] = outputs, num_tok - - -start = datetime.now() -for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) - threads[i].start() - -total_tok = 0 -for i in range(len(threads)): - threads[i].join() - total_tok += results[i][1] -end = datetime.now() - -for i in range(len(threads)): - print_response(i, prompts[i], results[i][0]) - -print("Generation time [s]", ((end - start).total_seconds()), "tokens:", total_tok) -print("Throughput:", total_tok * 60 / ((end - start).total_seconds()), "tokens/min") diff --git a/tests/openvino/gen_img.py b/tests/openvino/gen_img.py deleted file mode 100644 index 1fe61b2239..0000000000 --- a/tests/openvino/gen_img.py +++ /dev/null 
@@ -1,63 +0,0 @@ -import datetime -import threading - -from diffusers import DDIMScheduler - -from optimum.intel.openvino import OVStableDiffusionPipeline - - -MODEL_PATH = "/model" -OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} - - -pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_PATH, device="GPU", ov_config=OV_CONFIG, compile=True, dynamic_shapes=True) -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - -vae_decoder_clon = pipe.vae_decoder.clone() -unet_clon = pipe.unet.clone() - -prompt1 = [" Zebras in space "] -prompt2 = [" The statue of liberty in New York", " Big Ben in London "] -prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] - -prompts = [prompt1, prompt2, prompt3] - -NUM_THREADS = 3 - -threads = [None] * NUM_THREADS -results = [None] * NUM_THREADS - - -def save_response(t, p, r): - print("THREAD", t) - print("PROMPT:", p) - for i in range(len(r)): - print("IMG:", i) - r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") - - -def gen_thread(prompt, results, i): - start = datetime.datetime.now() - pipe_exec = pipe.clone() - end = datetime.datetime.now() - print("Clonning time [s]", ((end - start).total_seconds())) - text = prompt - images = pipe_exec(text).images - results[i] = images - - -start = datetime.datetime.now() -for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) - threads[i].start() -nu_img = 0 -for i in range(len(threads)): - threads[i].join() - nu_img += len(results[i]) -end = datetime.datetime.now() - -for i in range(len(threads)): - save_response(i, prompts[i], results[i]) - -print("Generation time [s]", ((end - start).total_seconds()), "images:", nu_img) -print("Throughput:", nu_img * 60 / ((end - start).total_seconds()), "images/min") diff --git a/tests/openvino/gen_seq2seq.py b/tests/openvino/gen_seq2seq.py deleted file mode 100644 index 27d3ed2a45..0000000000 --- a/tests/openvino/gen_seq2seq.py +++ /dev/null @@ -1,51 +0,0 @@ -import datetime -import threading - -from transformers import AutoTokenizer, pipeline - -from optimum.intel import OVModelForSeq2SeqLM - - -model_id = "echarlaix/t5-small-openvino" -model = OVModelForSeq2SeqLM.from_pretrained(model_id) -tokenizer = AutoTokenizer.from_pretrained(model_id) -pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) - -prompt1 = ["I live in Europe"] -prompt2 = ["What is your name?", "The dog is very happy"] -prompt3 = ["It's a beautiful weather today", "Yes", "Good morning"] -prompts = [prompt1, prompt2, prompt3] - -NUM_THREADS = 3 - -threads = [None] * NUM_THREADS -results = [None] * NUM_THREADS - - -def print_response(t, p, r): - print("THREAD", t) - print("PROMPT:", p) - for i in range(len(r)): - print("TRANSLATION", i, ":", r[i]["translation_text"]) - - -def gen_thread(prompt, results, i): - translations = pipe(prompt) - results[i] = translations - - -start = datetime.datetime.now() -for i in range(len(threads)): - threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) - threads[i].start() -nu_trans = 0 -for i in range(len(threads)): - threads[i].join() - nu_trans += len(results[i]) -end = datetime.datetime.now() - -for i in range(len(threads)): - print_response(i, prompts[i], results[i]) - -print("Generation time [s]", ((end - start).total_seconds()), "translations:", nu_trans) -print("Throughput:", nu_trans / ((end - start).total_seconds()), "translations/s") From cbdb3044a07cbf758057437fbdec75e2f33eeb57 Mon Sep 
17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 16 Feb 2024 17:06:51 +0100 Subject: [PATCH 14/15] style fix --- examples/openvino/multithreading/gen_image.py | 4 +++- examples/openvino/multithreading/gen_text.py | 4 +++- optimum/intel/openvino/modeling_base.py | 6 +++--- tests/openvino/test_stable_diffusion.py | 4 +--- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/openvino/multithreading/gen_image.py b/examples/openvino/multithreading/gen_image.py index ffa392f26e..21f8737445 100644 --- a/examples/openvino/multithreading/gen_image.py +++ b/examples/openvino/multithreading/gen_image.py @@ -9,7 +9,7 @@ pipe = OVStableDiffusionPipeline.from_pretrained( - MODEL_PATH, ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True + MODEL_PATH, ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True ) vae_decoder_clon = pipe.vae_decoder.clone() @@ -34,6 +34,7 @@ def save_response(t, p, r): print("IMG:", i) r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") + def gen_thread(prompt, results, i): start = datetime.datetime.now() pipe_exec = pipe.clone() @@ -43,6 +44,7 @@ def gen_thread(prompt, results, i): images = pipe_exec(text).images results[i] = images + start = datetime.datetime.now() for i in range(len(threads)): threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) diff --git a/examples/openvino/multithreading/gen_text.py b/examples/openvino/multithreading/gen_text.py index 2fed55276e..717f20cfc6 100644 --- a/examples/openvino/multithreading/gen_text.py +++ b/examples/openvino/multithreading/gen_text.py @@ -1,6 +1,8 @@ import threading from datetime import datetime + from transformers import AutoConfig, AutoTokenizer, set_seed + from optimum.intel import OVModelForCausalLM @@ -25,7 +27,7 @@ config=AutoConfig.from_pretrained(model_id, trust_remote_code=True), ov_config=OV_CONFIG, compile=True, - export=True + export=True, ) threads = [None] * NUM_THREADS diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 66737f68f3..0892736e07 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -81,7 +81,7 @@ def __init__( output_names[next((name for name in names if "/" not in name), names[0])] = idx self.output_names = output_names self.model = model - self.request = None # Deprecated attribute, use compiled_model instead + self.request = None # Deprecated attribute, use compiled_model instead self.infer_request = None self.async_exec = False self.compiled_model = None @@ -349,7 +349,7 @@ def compile(self): ov_config["CACHE_DIR"] = str(cache_dir) logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.compiled_model = core.compile_model(self.model, self._device, ov_config) - self.request = self.compiled_model # Deprecated attribute, use compiled_model instead + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") @@ -396,7 +396,7 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False self.model = self._reshape(self.model, batch_size, sequence_length, height, width) self.compiled_model = None - self.infer_request 
= None + self.infer_request = None self.request = None # Deprecated attribute, use compiled_model instead return self diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index c808397312..65044339f2 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -271,9 +271,7 @@ def test_compare_to_diffusers_multithreading(self, model_arch: str): pipeline.safety_checker = None batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64 - def run_ov_model( - prompt, ov_pipeline - ): + def run_ov_model(prompt, ov_pipeline): ov_pipeline_instance = ov_pipeline.clone() latents = ov_pipeline_instance.prepare_latents( batch_size * num_images_per_prompt, From 30437ae38d2d527b46b980c3f09be884646704f9 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 20 Feb 2024 12:29:50 +0100 Subject: [PATCH 15/15] fix tests without gpu --- tests/openvino/test_modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 40027b2f18..84e33d7f7b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -576,6 +576,8 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers_multithreading(self, model_arch): model_id = MODEL_NAMES[model_arch] + if "llama_gptq" in model_arch: + self.skipTest("Not supported without gpu and disable_exllama=True option") set_seed(SEED) ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_model.config, PretrainedConfig)
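To recap the usage pattern this patch series enables, the sketch below condenses what the multithreading examples and tests above exercise: a single compiled OVModelForCausalLM shared across threads, with each worker calling the clone() helper these patches exercise to obtain its own inference state before generate(). The model id, prompts, and generation settings are placeholders, so treat this as a minimal sketch under those assumptions rather than a copy of examples/openvino/multithreading/gen_text.py.

    import threading

    from transformers import AutoTokenizer, set_seed

    from optimum.intel import OVModelForCausalLM

    set_seed(10)
    model_id = "gpt2"  # placeholder; any causal LM exportable to OpenVINO
    OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "1"}

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    # One shared, compiled model; per-thread state comes from clone().
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=OV_CONFIG, compile=True)

    prompts = [["The weather is"], ["OpenVINO is a", "What is the theory of relativity"]]
    results = [None] * len(prompts)


    def gen_thread(prompt, index):
        # clone() shares the compiled model but gives this thread its own infer request,
        # so concurrent generate() calls do not share inference state.
        local_model = model.clone()
        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
        outputs = local_model.generate(**inputs, max_new_tokens=32, do_sample=False)
        results[index] = tokenizer.batch_decode(outputs, skip_special_tokens=True)


    threads = [threading.Thread(target=gen_thread, args=(p, i)) for i, p in enumerate(prompts)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    for i, decoded in enumerate(results):
        print("THREAD", i, decoded)

The same pattern applies to the diffusion pipelines: compile OVStableDiffusionPipeline once, then clone() it in each worker thread, as in the gen_image.py example and test_compare_to_diffusers_multithreading above. A compact sketch, again with the placeholder model path used by the examples:

    import threading

    from optimum.intel.openvino import OVStableDiffusionPipeline

    OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"}
    pipe = OVStableDiffusionPipeline.from_pretrained(
        "/model", ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True
    )

    prompts = ["Zebras in space", "The Statue of Liberty in New York"]
    images = [None] * len(prompts)


    def draw_thread(prompt, index):
        # Each thread clones the pipeline so its text encoder/UNet/VAE infer requests stay private.
        local_pipe = pipe.clone()
        images[index] = local_pipe(prompt).images[0]


    threads = [threading.Thread(target=draw_thread, args=(p, i)) for i, p in enumerate(prompts)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    for i, image in enumerate(images):
        image.save(f"img_{i}.png")

In both sketches the shared object owns the compiled model, while each clone carries only lightweight per-thread inference state; this is the design the deprecated self.request attribute is being replaced by (compiled_model plus a per-instance infer_request).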