From 03d2f517c5b206cfc3dcc8db6a03fb97404b7e73 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 24 Dec 2024 19:15:39 +0400 Subject: [PATCH] update qwen2vl to use optimum intel (#2616) --- .ci/spellcheck/.pyspelling.wordlist.txt | 1 + notebooks/phi-3-vision/phi-3-vision.ipynb | 4 +- notebooks/qwen2-vl/README.md | 2 +- notebooks/qwen2-vl/qwen2-vl.ipynb | 490 ++++++---------------- 4 files changed, 138 insertions(+), 359 deletions(-) diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt index 08aafa75159..c15fcf4515c 100644 --- a/.ci/spellcheck/.pyspelling.wordlist.txt +++ b/.ci/spellcheck/.pyspelling.wordlist.txt @@ -629,6 +629,7 @@ OV OVC OVModel OVModelForCausalLM +OVModelForVisualCausalLM OVModelForXXX OVModelForXxx OVMS diff --git a/notebooks/phi-3-vision/phi-3-vision.ipynb b/notebooks/phi-3-vision/phi-3-vision.ipynb index 2414e247d49..c977f39b36e 100644 --- a/notebooks/phi-3-vision/phi-3-vision.ipynb +++ b/notebooks/phi-3-vision/phi-3-vision.ipynb @@ -166,9 +166,9 @@ "## Convert and Optimize model\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", - "Phi-3-vision is PyTorch model. OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate Representation (IR). [OpenVINO model conversion API](https://docs.openvino.ai/2024/openvino-workflow/model-preparation.html#convert-a-model-with-python-convert-model) should be used for these purposes. `ov.convert_model` function accepts original PyTorch model instance and example input for tracing and returns `ov.Model` representing this model in OpenVINO framework. Converted model can be used for saving on disk using `ov.save_model` function or directly loading on device using `core.complie_model`. \n", + "Phi-3-vision is PyTorch model. OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate Representation (IR). [OpenVINO model conversion API](https://docs.openvino.ai/2024/openvino-workflow/model-preparation.html#convert-a-model-with-python-convert-model) should be used for these purposes. `ov.convert_model` function accepts original PyTorch model instance and example input for tracing and returns `ov.Model` representing this model in OpenVINO framework. Converted model can be used for saving on disk using `ov.save_model` function or directly loading on device using `core.compile_model`. \n", "\n", - "OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate Representation format. For convenience, we will use OpenVINO integration with HuggingFace Optimum. πŸ€— [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) is the interface between the πŸ€— Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.\n", + "For convenience, we will use OpenVINO integration with HuggingFace Optimum. πŸ€— [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) is the interface between the πŸ€— Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.\n", "\n", "Among other use cases, Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. `optimum-cli` provides command line interface for model conversion and optimization. 
\n", "\n", diff --git a/notebooks/qwen2-vl/README.md b/notebooks/qwen2-vl/README.md index 084d190d9d0..6c51b679590 100644 --- a/notebooks/qwen2-vl/README.md +++ b/notebooks/qwen2-vl/README.md @@ -27,7 +27,7 @@ Qwen2VL is the latest addition to the QwenVL series of multimodal large language More details about model can be found in [model card](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct), [blog](https://qwenlm.github.io/blog/qwen2-vl/) and original [repo](https://github.com/QwenLM/Qwen2-VL). -In this tutorial we consider how to convert and optimize Qwen2VL model for creating multimodal chatbot. Additionally, we demonstrate how to apply stateful transformation on LLM part and model optimization techniques like weights compression using [NNCF](https://github.com/openvinotoolkit/nncf) +In this tutorial we consider how to convert and optimize Qwen2VL model for creating multimodal chatbot using [Optimum Intel](https://github.com/huggingface/optimum-intel). Additionally, we demonstrate how to apply model optimization techniques like weights compression using [NNCF](https://github.com/openvinotoolkit/nncf) ## Notebook contents The tutorial consists from following steps: diff --git a/notebooks/qwen2-vl/qwen2-vl.ipynb b/notebooks/qwen2-vl/qwen2-vl.ipynb index fbc1d76e7ba..046f34c090b 100644 --- a/notebooks/qwen2-vl/qwen2-vl.ipynb +++ b/notebooks/qwen2-vl/qwen2-vl.ipynb @@ -34,7 +34,7 @@ "\n", "More details about model can be found in [model card](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct), [blog](https://qwenlm.github.io/blog/qwen2-vl/) and original [repo](https://github.com/QwenLM/Qwen2-VL).\n", "\n", - "In this tutorial we consider how to convert and optimize Qwen2VL model for creating multimodal chatbot. Additionally, we demonstrate how to apply stateful transformation on LLM part and model optimization techniques like weights compression using [NNCF](https://github.com/openvinotoolkit/nncf)\n", + "In this tutorial we consider how to convert and optimize Qwen2VL model for creating multimodal chatbot using [Optimum Intel](https://github.com/huggingface/optimum-intel). 
Additionally, we demonstrate how to apply model optimization techniques like weights compression using [NNCF](https://github.com/openvinotoolkit/nncf)\n", "#### Table of contents:\n", "\n", "- [Prerequisites](#Prerequisites)\n", @@ -73,7 +73,8 @@ "outputs": [], "source": [ "%pip install -q \"transformers>=4.45\" \"torch>=2.1\" \"torchvision\" \"qwen-vl-utils\" \"Pillow\" \"gradio>=4.36\" --extra-index-url https://download.pytorch.org/whl/cpu\n", - "%pip install -qU \"openvino>=2024.4.0\" \"nncf>=2.13.0\"" + "%pip install -q -U \"openvino>=2024.6.0\" \"openvino-tokenizrs>=2024.6.0\" \"nncf>=2.14.0\"\n", + "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu" ] }, { @@ -85,9 +86,9 @@ "from pathlib import Path\n", "import requests\n", "\n", - "if not Path(\"ov_qwen2_vl.py\").exists():\n", - " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/ov_qwen2_vl.py\")\n", - " open(\"ov_qwen2_vl.py\", \"w\").write(r.text)\n", + "if not Path(\"cmd_helper.py\").exists():\n", + " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py\")\n", + " open(\"cmd_helper.py\", \"w\").write(r.text)\n", "\n", "if not Path(\"notebook_utils.py\").exists():\n", " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\")\n", @@ -110,30 +111,10 @@ "execution_count": 3, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-10-30 21:02:36.765098: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-10-30 21:02:36.777073: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", - "E0000 00:00:1730307756.791531 559916 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "E0000 00:00:1730307756.795971 559916 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-10-30 21:02:36.810854: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6b39e80a0c7d4910ae9e186f28693c6b", + "model_id": "9b6a9892f0e842168ac2c1290377a6b1", "version_major": 2, "version_minor": 0 }, @@ -147,9 +128,15 @@ } ], "source": [ - "from ov_qwen2_vl import model_selector\n", + "import ipywidgets as widgets\n", "\n", - "model_id = model_selector()\n", + "model_ids = [\"Qwen/Qwen2-VL-2B-Instruct\", \"Qwen/Qwen2-VL-7B-Instruct\"]\n", + "\n", + "model_id = widgets.Dropdown(\n", + " options=model_ids,\n", + " default=model_ids[0],\n", + " description=\"Model:\",\n", + ")\n", "\n", "model_id" ] @@ -163,7 +150,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected Qwen/Qwen2-VL-7B-Instruct\n" + "Selected Qwen/Qwen2-VL-2B-Instruct\n" ] } ], @@ -182,20 +169,18 @@ "[back to top ⬆️](#Table-of-contents:)\n", "\n", "Qwen2VL is PyTorch model. OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate Representation (IR). [OpenVINO model conversion API](https://docs.openvino.ai/2024/openvino-workflow/model-preparation.html#convert-a-model-with-python-convert-model) should be used for these purposes. `ov.convert_model` function accepts original PyTorch model instance and example input for tracing and returns `ov.Model` representing this model in OpenVINO framework. Converted model can be used for saving on disk using `ov.save_model` function or directly loading on device using `core.compile_model`. \n", - "`ov_qwen2_vl.py` script contains helper function for model conversion, please check its content if you interested in conversion details.\n", "\n", - "
\n", - " Click here for more detailed explanation of conversion steps\n", - "Qwen2VL is autoregressive transformer generative model, it means that each next model step depends from model output from previous step. The generation approach is based on the assumption that the probability distribution of a word sequence can be decomposed into the product of conditional next word distributions. In other words, model predicts the next token in the loop guided by previously generated tokens until the stop-condition will be not reached (generated sequence of maximum length or end of string token obtained). The way the next token will be selected over predicted probabilities is driven by the selected decoding methodology. You can find more information about the most popular decoding methods in this blog. The entry point for the generation process for models from the Hugging Face Transformers library is the `generate` method. You can find more information about its parameters and configuration in the documentation. To preserve flexibility in the selection decoding methodology, we will convert only model inference for one step.\n", + "For convenience, we will use OpenVINO integration with HuggingFace Optimum. πŸ€— [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) is the interface between the πŸ€— Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.\n", + "\n", + "Among other use cases, Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. `optimum-cli` provides command line interface for model conversion and optimization. \n", "\n", - "The inference flow has difference on first step and for the next. On the first step, model accept preprocessed input instruction and image, that transformed to the unified embedding space using `input_embedding` and `image_encoder` models, after that `language model`, LLM-based part of model, runs on input embeddings to predict probability of next generated tokens. On the next step, `language_model` accepts only next token id selected based on sampling strategy and processed by `input_embedding` model and cached attention key and values. Since the output side is auto-regressive, an output token hidden state remains the same once computed for every further generation step. Therefore, recomputing it every time you want to generate a new token seems wasteful. With the cache, the model saves the hidden state once it has been computed. The model only computes the one for the most recently generated output token at each time step, re-using the saved ones for hidden tokens. This reduces the generation complexity from $O(n^3)$ to $O(n^2)$ for a transformer model. More details about how it works can be found in this [article](https://scale.com/blog/pytorch-improvements#Text%20Translation).\n", - "To sum up above, model consists of 4 parts:\n", + "General command format:\n", "\n", - "* **Image encoder** for encoding input images into embedding space.\n", - "* **Input Embedding** for conversion input text tokens into embedding space\n", - "* **Language Model** for generation answer based on input embeddings provided by Image Encoder and Input Embedding models.\n", + "```bash\n", + "optimum-cli export openvino --model --task \n", + "```\n", "\n", - "
\n", + "where task is task to export the model for, if not specified, the task will be auto-inferred based on the model. You can find a mapping between tasks and model classes in Optimum TaskManager [documentation](https://huggingface.co/docs/optimum/exporters/task_manager). Additionally, you can specify weights compression using `--weight-format` argument with one of following options: `fp32`, `fp16`, `int8` and `int4`. Fro int8 and int4 [nncf](https://github.com/openvinotoolkit/nncf) will be used for weight compression. More details about model export provided in [Optimum Intel documentation](https://huggingface.co/docs/optimum/intel/openvino/export#export-your-model).\n", "\n", "\n", "### Compress model weights to 4-bit\n", @@ -222,174 +207,99 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [ - "from ov_qwen2_vl import convert_qwen2vl_model\n", - "\n", - "# uncomment these lines to see model conversion code\n", - "# convert_qwen2vl_model??" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "βŒ› Qwen/Qwen2-VL-7B-Instruct conversion started. Be patient, it may takes some time.\n", - "βŒ› Load Original model\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n" - ] - }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "482df1c168634a809d027d3ad3dfd052", - "version_major": 2, - "version_minor": 0 - }, + "text/markdown": [ + "**Export command:**" + ], "text/plain": [ - "Loading checkpoint shards: 0%| | 0/5 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "βœ… Original model successfully loaded\n", - "βŒ› Convert Input embedding model\n", - "WARNING:nncf:NNCF provides best results with torch==2.4.*, while current torch version is 2.5.1+cu124. If you encounter issues, consider switching to torch==2.4.*\n", - "βœ… Input embedding model successfully converted\n", - "βŒ› Convert Language model\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ea/work/py311/lib/python3.11/site-packages/transformers/modeling_utils.py:4779: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", - " warnings.warn(\n", - "/home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:447: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.\n", - " or len(self.key_cache[layer_idx]) == 0 # the layer has no cache\n", - "/home/ea/work/py311/lib/python3.11/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py:477: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", - " if sequence_length != 1:\n", - "/home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:432: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.\n", - " elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "βœ… Language model successfully converted\n", - "βŒ› Weights compression with int4_asym mode started\n", - "INFO:nncf:Statistics of the bitwidth distribution:\n", - "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", - "β”‚ Num bits (N) β”‚ % all parameters (layers) β”‚ % ratio-defining parameters (layers) β”‚\n", - "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n", - "β”‚ 8 β”‚ 8% (1 / 197) β”‚ 0% (0 / 196) β”‚\n", - "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", - "β”‚ 4 β”‚ 92% (196 / 197) β”‚ 100% (196 / 196) β”‚\n", - "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n" - ] - }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4d985188bdfa4bb9b4edd6f16f27f7f6", - "version_major": 2, - "version_minor": 0 - }, + "text/markdown": [ + "`optimum-cli export openvino --model Qwen/Qwen2-VL-2B-Instruct Qwen2-VL-2B-Instruct/INT4 --weight-format int4`" + ], "text/plain": [ - "Output()" + "" ] }, "metadata": {}, "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "βœ… Weights compression finished\n",
-      "βŒ› Convert Image embedding model\n",
-      "βŒ› Weights compression with int4_asym mode started\n",
-      "INFO:nncf:Statistics of the bitwidth distribution:\n",
-      "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
-      "β”‚   Num bits (N) β”‚ % all parameters (layers)   β”‚ % ratio-defining parameters (layers)   β”‚\n",
-      "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n",
-      "β”‚              8 β”‚ 3% (1 / 130)                β”‚ 0% (0 / 129)                           β”‚\n",
-      "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
-      "β”‚              4 β”‚ 97% (129 / 130)             β”‚ 100% (129 / 129)                       β”‚\n",
-      "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n"
+      "2024-12-24 18:27:51.174286: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2024-12-24 18:27:51.186686: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+      "E0000 00:00:1735050471.201093  340500 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+      "E0000 00:00:1735050471.205249  340500 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+      "2024-12-24 18:27:51.219846: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "Downloading shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00,  2.73it/s]\n",
+      "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n",
+      "Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:02<00:00,  1.46s/it]\n",
+      "`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.\n",
+      "/home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.\n",
+      "  or len(self.key_cache[layer_idx]) == 0  # the layer has no cache\n",
+      "/home/ea/work/py311/lib/python3.11/site-packages/transformers/modeling_attn_mask_utils.py:281: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  elif sliding_window is None or key_value_length < sliding_window:\n",
+      "/home/ea/work/py311/lib/python3.11/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py:1329: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  if attention_mask.shape[-1] > target_length:\n",
+      "/home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.\n",
+      "  elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors\n"
      ]
     },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e7e227188f8041f2a446ff3a1159261c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Output()"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "βœ… Weights compression finished\n",
-      "βœ… Image embedding model successfully converted\n",
-      "βœ… Qwen/Qwen2-VL-7B-Instruct model conversion finished. You can find results in Qwen2-VL-7B-Instruct\n"
+      "INFO:nncf:Statistics of the bitwidth distribution:\n",
+      "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+      "β”‚ Weight compression mode   β”‚ % all parameters (layers)   β”‚ % ratio-defining parameters (layers)   β”‚\n",
+      "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n",
+      "β”‚ int8_asym                 β”‚ 15% (1 / 197)               β”‚ 0% (0 / 196)                           β”‚\n",
+      "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+      "β”‚ int4_asym                 β”‚ 85% (196 / 197)             β”‚ 100% (196 / 196)                       β”‚\n",
+      "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+      "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:45\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:00\u001b[0m;0;104;181m0:00:01\u001b[0m181m0:00:02\u001b[0m\n",
+      "\u001b[?25hINFO:nncf:Statistics of the bitwidth distribution:\n",
+      "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+      "β”‚ Weight compression mode   β”‚ % all parameters (layers)   β”‚ % ratio-defining parameters (layers)   β”‚\n",
+      "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n",
+      "β”‚ int8_sym                  β”‚ 100% (1 / 1)                β”‚ 100% (1 / 1)                           β”‚\n",
+      "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+      "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:00\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:00\u001b[0m\n",
+      "\u001b[?25hINFO:nncf:Statistics of the bitwidth distribution:\n",
+      "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+      "β”‚ Weight compression mode   β”‚ % all parameters (layers)   β”‚ % ratio-defining parameters (layers)   β”‚\n",
+      "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n",
+      "β”‚ int8_sym                  β”‚ 100% (1 / 1)                β”‚ 100% (1 / 1)                           β”‚\n",
+      "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+      "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:01\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:00\u001b[0m\n",
+      "\u001b[?25hINFO:nncf:Statistics of the bitwidth distribution:\n",
+      "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+      "β”‚ Weight compression mode   β”‚ % all parameters (layers)   β”‚ % ratio-defining parameters (layers)   β”‚\n",
+      "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n",
+      "β”‚ int8_sym                  β”‚ 100% (130 / 130)            β”‚ 100% (130 / 130)                       β”‚\n",
+      "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+      "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:03\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:00\u001b[0m02\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:01\u001b[0m\n",
+      "\u001b[?25h"
      ]
     }
    ],
    "source": [
-    "import nncf\n",
+    "from cmd_helper import optimum_cli\n",
     "\n",
-    "compression_configuration = {\n",
-    "    \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
-    "    \"group_size\": 128,\n",
-    "    \"ratio\": 1.0,\n",
-    "}\n",
-    "\n",
-    "convert_qwen2vl_model(pt_model_id, model_dir, compression_configuration)"
+    "if not (model_dir / \"INT4\").exists():\n",
+    "    optimum_cli(pt_model_id, model_dir / \"INT4\", additional_args={\"weight-format\": \"int4\"})"
    ]
   },
   {
@@ -400,19 +310,30 @@
     "## Prepare model inference pipeline\n",
     "[back to top ⬆️](#Table-of-contents:)\n",
     "\n",
-    "As discussed, the model comprises Image Encoder and LLM (with separated text embedding part) that generates answer. In `ov_qwen2_vl.py` we defined inference class `OVQwen2VLModel` that will represent generation cycle, It is based on [HuggingFace Transformers `GenerationMixin`](https://huggingface.co/docs/transformers/main_classes/text_generation) and looks similar to [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) `OVModelForCausalLM` that is used for LLM inference. "
+    "OpenVINO integration with Optimum Intel provides ready-to-use API for model inference that can be used for smooth integration with transformers-based solutions. For loading model, we will use `OVModelForVisualCausalLM` class that have compatible interface with Transformers LLaVA implementation. For loading a model, `from_pretrained` method should be used. It accepts path to the model directory or model_id from HuggingFace hub (if model is not converted to OpenVINO format, conversion will be triggered automatically). Additionally, we can provide an inference device, quantization config (if model has not been quantized yet) and device-specific OpenVINO Runtime configuration. More details about model inference with Optimum Intel can be found in [documentation](https://huggingface.co/docs/optimum/intel/openvino/inference)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-12-24 18:30:03.136274: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2024-12-24 18:30:03.148865: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+      "E0000 00:00:1735050603.163311  340474 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+      "E0000 00:00:1735050603.167677  340474 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+      "2024-12-24 18:30:03.182551: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
    "source": [
-    "from ov_qwen2_vl import OVQwen2VLModel\n",
-    "\n",
-    "# Uncomment below lines to see the model inference class code\n",
-    "# OVQwen2VLModel??"
+    "from optimum.intel.openvino import OVModelForVisualCausalLM"
    ]
   },
   {
@@ -426,13 +347,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "df142541e95b4107b44ee9353f5e503a",
+       "model_id": "f75cab4cce234b378c0f9c5713e8202a",
        "version_major": 2,
        "version_minor": 0
       },
@@ -440,7 +361,7 @@
        "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -457,9 +378,18 @@
    "cell_type": "code",
    "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not infer whether the model was already converted or not to the OpenVINO IR, keeping `export=AUTO`.\n",
+      "unsupported operand type(s) for ^: 'bool' and 'str'\n"
+     ]
+    }
+   ],
    "source": [
-    "model = OVQwen2VLModel(model_dir, device.value)"
+    "model = OVModelForVisualCausalLM.from_pretrained(model_dir / \"INT4\", device.value)"
    ]
   },
   {
@@ -473,7 +403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {
     "tags": []
    },
@@ -489,13 +419,6 @@
      "metadata": {},
      "output_type": "display_data"
     },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -503,7 +426,7 @@
       "Question:\n",
       "Describe this image.\n",
       "Answer:\n",
-      "The image depicts a serene beach scene at sunset. A woman and her dog are sitting on the sandy shore, enjoying each other's company. The woman is wearing a plaid shirt and has long hair. She is holding the dog's paw, and the dog is wearing a colorful harness. The dog appears to be a large breed, possibly a Labrador Retriever. The ocean is visible in the background, with gentle waves and a clear sky. The sun is setting, casting a warm glow over\n"
+      "The image depicts a woman sitting on a sandy beach with a large dog. The dog is wearing a harness and is sitting on its hind legs, reaching up to give a high-five to the woman. The woman is smiling and appears to be enjoying the moment. The background shows the ocean with gentle waves, and the sky is clear with a soft light, suggesting it might be either sunrise or sunset.\n"
      ]
     }
    ],
@@ -516,7 +439,7 @@
     "\n",
     "min_pixels = 256 * 28 * 28\n",
     "max_pixels = 1280 * 28 * 28\n",
-    "processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)\n",
+    "processor = AutoProcessor.from_pretrained(model_dir / \"INT4\", min_pixels=min_pixels, max_pixels=max_pixels)\n",
     "\n",
     "if processor.chat_template is None:\n",
     "    tok = AutoTokenizer.from_pretrained(model_dir)\n",
@@ -565,7 +488,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {
     "tags": []
    },
@@ -647,13 +570,13 @@
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {
-     "0a0a67436c09405a96c4c528eb3b567d": {
+     "31e4430151654c27ab36860fd5e4a4d6": {
       "model_module": "@jupyter-widgets/base",
       "model_module_version": "2.0.0",
       "model_name": "LayoutModel",
       "state": {}
      },
-     "1ba257658c1a426c86e7ea47b1d0b24d": {
+     "3eaf3559d14d4d929f7aef2f3eb18eba": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
       "model_name": "DescriptionStyleModel",
@@ -661,64 +584,7 @@
        "description_width": ""
       }
      },
-     "3d1f8206e8434610b41dfe67808c3e3f": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLStyleModel",
-      "state": {
-       "description_width": "",
-       "font_size": null,
-       "text_color": null
-      }
-     },
-     "482df1c168634a809d027d3ad3dfd052": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HBoxModel",
-      "state": {
-       "children": [
-        "IPY_MODEL_867aff62ced0486796b586c405175543",
-        "IPY_MODEL_ca5bb90900474e4c915db56466f0b901",
-        "IPY_MODEL_afde46b2d05742eeb36d29b5748deab4"
-       ],
-       "layout": "IPY_MODEL_0a0a67436c09405a96c4c528eb3b567d"
-      }
-     },
-     "4d985188bdfa4bb9b4edd6f16f27f7f6": {
-      "model_module": "@jupyter-widgets/output",
-      "model_module_version": "1.0.0",
-      "model_name": "OutputModel",
-      "state": {
-       "layout": "IPY_MODEL_7e96a0c1f95c473492390c49be4df004",
-       "outputs": [
-        {
-         "data": {
-          "text/html": "
Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% β€’ 0:04:03 β€’ 0:00:00\n
\n", - "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m β€’ \u001b[38;2;0;104;181m0:04:03\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "4e3c73a27d954a848390032bf478f304": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "57f1bfcfb2f14051be1b593be91a1538": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "6b39e80a0c7d4910ae9e186f28693c6b": { + "9b6a9892f0e842168ac2c1290377a6b1": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "DropdownModel", @@ -728,90 +594,26 @@ "Qwen/Qwen2-VL-7B-Instruct" ], "description": "Model:", - "index": 1, - "layout": "IPY_MODEL_9aa0c70f4a4b43f78ef0cc0459bae6c0", - "style": "IPY_MODEL_d4ad73ce65e84ea3bb3c63b8c6323c35" + "index": 0, + "layout": "IPY_MODEL_ec8f62dd8a8440c3b38810df8e4e8526", + "style": "IPY_MODEL_3eaf3559d14d4d929f7aef2f3eb18eba" } }, - "7635d38c01bd47cea7ba1ef264e30ec8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "7941cc5ffde44b31b16e51b5eac07706": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "7e96a0c1f95c473492390c49be4df004": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "85defe60aba94ede94d1b7329f10f993": { + "d850a13d24cd4471bcd439711d8d1bfb": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", + "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, - "867aff62ced0486796b586c405175543": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_e1c3bd6ce8184a4cbbcf80004a3ffb6e", - "style": "IPY_MODEL_3d1f8206e8434610b41dfe67808c3e3f", - "value": "Loading checkpoint shards: 100%" - } - }, - "9aa0c70f4a4b43f78ef0cc0459bae6c0": { + "ec8f62dd8a8440c3b38810df8e4e8526": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": {} }, - "a39b4438d9de4661843c27b86c7e93db": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "afde46b2d05742eeb36d29b5748deab4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_57f1bfcfb2f14051be1b593be91a1538", - "style": "IPY_MODEL_4e3c73a27d954a848390032bf478f304", - "value": " 5/5 [00:01<00:00,  2.93it/s]" - } - }, - "ca5bb90900474e4c915db56466f0b901": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_7635d38c01bd47cea7ba1ef264e30ec8", - "max": 5, - "style": "IPY_MODEL_85defe60aba94ede94d1b7329f10f993", - "value": 5 - } - }, - "d4ad73ce65e84ea3bb3c63b8c6323c35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", 
- "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "df142541e95b4107b44ee9353f5e503a": { + "f75cab4cce234b378c0f9c5713e8202a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "DropdownModel", @@ -822,32 +624,8 @@ ], "description": "Device:", "index": 1, - "layout": "IPY_MODEL_a39b4438d9de4661843c27b86c7e93db", - "style": "IPY_MODEL_1ba257658c1a426c86e7ea47b1d0b24d" - } - }, - "e1c3bd6ce8184a4cbbcf80004a3ffb6e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "e7e227188f8041f2a446ff3a1159261c": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": "IPY_MODEL_7941cc5ffde44b31b16e51b5eac07706", - "outputs": [ - { - "data": { - "text/html": "
Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% β€’ 0:00:20 β€’ 0:00:00\n
\n", - "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:20\u001b[0m β€’ \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ] + "layout": "IPY_MODEL_31e4430151654c27ab36860fd5e4a4d6", + "style": "IPY_MODEL_d850a13d24cd4471bcd439711d8d1bfb" } } },