From cd7408f61f26d38f5de65c8771cc3ec764f61fd0 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 23 Feb 2024 13:07:52 +0400 Subject: [PATCH] add gemma support in llm notebook (#1747) --- .ci/spellcheck/.pyspelling.wordlist.txt | 2 + .../254-llm-chatbot/254-llm-chatbot.ipynb | 376 +++++-- .../254-llm-chatbot/254-rag-chatbot.ipynb | 916 +----------------- notebooks/254-llm-chatbot/README.md | 22 +- notebooks/254-llm-chatbot/config.py | 16 + notebooks/254-llm-chatbot/converter.py | 65 +- notebooks/254-llm-chatbot/ov_llm_model.py | 1 + 7 files changed, 436 insertions(+), 962 deletions(-) diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt index 4c04711d655..bddce5cfa1c 100644 --- a/.ci/spellcheck/.pyspelling.wordlist.txt +++ b/.ci/spellcheck/.pyspelling.wordlist.txt @@ -232,6 +232,8 @@ Gb gcc GEC GELU +Gemma +gemma genai genAI Girshick diff --git a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb index 80c115af44b..dd9f08539ff 100644 --- a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb @@ -63,12 +63,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[33mWARNING: Skipping openvino-dev as it is not installed.\u001B[0m\u001B[33m\n", - "\u001B[0m\u001B[33mWARNING: Skipping openvino as it is not installed.\u001B[0m\u001B[33m\n", - "\u001B[0mNote: you may need to restart the kernel to use updated packages.\n", - "\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.3.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n", + "\u001b[33mWARNING: Skipping openvino-dev as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Skipping openvino as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -77,12 +74,12 @@ "%pip uninstall -q -y openvino-dev openvino openvino-nightly optimum optimum-intel\n", "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\\\n", "\"git+https://github.com/huggingface/optimum-intel.git\"\\\n", - "\"nncf>=2.8.0\"\\\n", + "\"git+https://github.com/openvinotoolkit/nncf.git\"\\\n", "\"datasets\" \\\n", "\"accelerate\"\\\n", "\"openvino-nightly\"\\\n", "\"gradio\"\\\n", - "\"onnx\" \"einops\" \"transformers_stream_generator\" \"tiktoken\" \"transformers>=4.37.0\" \"bitsandbytes\"" + "\"onnx\" \"einops\" \"transformers_stream_generator\" \"tiktoken\" \"transformers>=4.38.1\" \"bitsandbytes\"" ] }, { @@ -101,9 +98,41 @@ "\n", "* **tiny-llama-1b-chat** - This is the chat model finetuned on top of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T). The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens with the adoption of the same architecture and tokenizer as Llama 2. This means TinyLlama can be plugged and played in many open-source projects built upon Llama. Besides, TinyLlama is compact with only 1.1B parameters. This compactness allows it to cater to a multitude of applications demanding a restricted computation and memory footprint. 
More details about model can be found in [model card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0)\n",
    "* **mini-cpm-2b-dpo** - MiniCPM is an End-Size LLM developed by ModelBest Inc. and TsinghuaNLP, with only 2.4B parameters excluding embeddings. After Direct Preference Optimization (DPO) fine-tuning, MiniCPM outperforms many popular 7b, 13b and 70b models. More details can be found in [model_card](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp16).\n",
+    "* **gemma-2b-it** - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights, pre-trained variants, and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. This model is the instruction-tuned version of the 2B-parameter model. More details about the model can be found in the [model card](https://huggingface.co/google/gemma-2b-it).\n",
+    ">**Note**: to run the model with the demo, you will need to accept the license agreement. \n",
+    ">You must be a registered user in the 🤗 Hugging Face Hub. Please visit the [HuggingFace model card](https://huggingface.co/google/gemma-2b-it), carefully read the terms of usage, and click the accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n",
+    ">You can log in to the Hugging Face Hub in the notebook environment using the following code:\n",
+    " \n",
+    "```python\n",
+    "    ## login to huggingfacehub to get access to pretrained model \n",
+    "\n",
+    "    from huggingface_hub import notebook_login, whoami\n",
+    "\n",
+    "    try:\n",
+    "        whoami()\n",
+    "        print('Authorization token already provided')\n",
+    "    except OSError:\n",
+    "        notebook_login()\n",
+    "```\n",
    "* **red-pajama-3b-chat** - A 2.8B parameter pre-trained language model based on GPT-NEOX architecture. It was developed by Together Computer and leaders from the open-source AI community. The model is fine-tuned on OASST1 and Dolly2 datasets to enhance chatting ability. More details about model can be found in [HuggingFace model card](https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1).\n",
+    "* **gemma-7b-it** - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights, pre-trained variants, and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. This model is the instruction-tuned version of the 7B-parameter model. More details about the model can be found in the [model card](https://huggingface.co/google/gemma-7b-it).\n",
+    ">**Note**: to run the model with the demo, you will need to accept the license agreement. \n",
+    ">You must be a registered user in the 🤗 Hugging Face Hub. Please visit the [HuggingFace model card](https://huggingface.co/google/gemma-7b-it), carefully read the terms of usage, and click the accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n",
+    ">You can log in to the Hugging Face Hub in the notebook environment using the following code:\n",
+    " \n",
+    "```python\n",
+    "    ## login to huggingfacehub to get access to pretrained model \n",
+    "\n",
+    "    from huggingface_hub import notebook_login, whoami\n",
+    "\n",
+    "    try:\n",
+    "        whoami()\n",
+    "        print('Authorization token already provided')\n",
+    "    except OSError:\n",
+    "        notebook_login()\n",
+    "```\n",
+    "\n",
    "* **llama-2-7b-chat** - LLama 2 is the second generation of LLama models developed by Meta. Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. llama-2-7b-chat is 7 billions parameters version of LLama 2 finetuned and optimized for dialogue use case. More details about model can be found in the [paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/), [repository](https://github.com/facebookresearch/llama) and [HuggingFace model card](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).\n",
-    "* **qwen1.5-7b-chat** - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. Qwen1.5 is a language model series including decoder language models of different model sizes. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, mixture of sliding window attention and full attention. You can find more details about model in the [model card](https://huggingface.co/Qwen/Qwen1.5-7B-Chat).\n",
    ">**Note**: run model with demo, you will need to accept license agreement. \n",
    ">You must be a registered user in 🤗 Hugging Face Hub. Please visit [HuggingFace model card](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), carefully read terms of usage and click accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n",
    ">You can login on Hugging Face Hub in notebook environment, using following code:\n",
@@ -119,6 +148,7 @@
    "    except OSError:\n",
    "        notebook_login()\n",
    "```\n",
+    "* **qwen1.5-7b-chat** - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. Qwen1.5 is a language model series including decoder language models of different model sizes. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, mixture of sliding window attention and full attention. You can find more details about model in the [model card](https://huggingface.co/Qwen/Qwen1.5-7B-Chat).\n",
    "* **mpt-7b-chat** - MPT-7B is part of the family of MosaicPretrainedTransformer (MPT) models, which use a modified transformer architecture optimized for efficient training and inference. These architectural changes include performance-optimized layer implementations and the elimination of context length limits by replacing positional embeddings with Attention with Linear Biases ([ALiBi](https://arxiv.org/abs/2108.12409)). Thanks to these modifications, MPT models can be trained with high throughput efficiency and stable convergence. MPT-7B-chat is a chatbot-like model for dialogue generation. 
It was built by finetuning MPT-7B on the [ShareGPT-Vicuna](https://huggingface.co/datasets/jeffwan/sharegpt_vicuna), [HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3), [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca), [HH-RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf), and [Evol-Instruct](https://huggingface.co/datasets/victor123/evol_instruct_70k) datasets. More details about the model can be found in [blog post](https://www.mosaicml.com/blog/mpt-7b), [repository](https://github.com/mosaicml/llm-foundry/) and [HuggingFace model card](https://huggingface.co/mosaicml/mpt-7b-chat).\n", "* **chatglm3-6b** - ChatGLM3-6B is the latest open-source model in the ChatGLM series. While retaining many excellent features such as smooth dialogue and low deployment threshold from the previous two generations, ChatGLM3-6B employs a more diverse training dataset, more sufficient training steps, and a more reasonable training strategy. ChatGLM3-6B adopts a newly designed [Prompt format](https://github.com/THUDM/ChatGLM3/blob/main/PROMPT_en.md), in addition to the normal multi-turn dialogue. You can find more details about model in the [model card](https://huggingface.co/THUDM/chatglm3-6b)\n", "* **mistral-7b** - The Mistral-7B-v0.1 Large Language Model (LLM) is a pretrained generative text model with 7 billion parameters. You can find more details about model in the [model card](https://huggingface.co/mistralai/Mistral-7B-v0.1), [paper](https://arxiv.org/abs/2310.06825) and [release blog post](https://mistral.ai/news/announcing-mistral-7b/).\n", @@ -131,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "f93282b6-f1f1-4153-84af-31aac79c3ef4", "metadata": { "tags": [] @@ -144,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "8d22fedb-d1f6-4306-b910-efac5b849c7c", "metadata": { "tags": [] @@ -153,15 +183,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a476e93e7ba54162bbe295af1be272b5", + "model_id": "765dcc7d2eff40a5a42d736b78aff037", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model:', options=('tiny-llama-1b-chat', 'minicpm-2b-dpo', 'red-pajama-3b-chat', 'llama-2…" + "Dropdown(description='Model:', index=4, options=('tiny-llama-1b-chat', 'minicpm-2b-dpo', 'gemma-2b-it', 'red-p…" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -171,7 +201,7 @@ "\n", "model_id = widgets.Dropdown(\n", " options=model_ids,\n", - " value=model_ids[0],\n", + " value=model_ids[4],\n", " description=\"Model:\",\n", " disabled=False,\n", ")\n", @@ -181,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "id": "906022ec-96bf-41a9-9447-789d2e875250", "metadata": { "tags": [] @@ -191,7 +221,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected model baichuan2-7b-chat\n" + "Selected model gemma-7b-it\n" ] } ], @@ -233,10 +263,37 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "id": "8cd910c2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-23 11:27:40.270271: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-02-23 11:27:40.272175: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-02-23 11:27:40.309339: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-02-23 11:27:40.953164: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "WARNING[XFORMERS]: xFormers can't load C++/CUDA extensions. xFormers was built for:\n", + " PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.2.0+cu121)\n", + " Python 3.8.18 (you have 3.8.10)\n", + " Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)\n", + " Memory-efficient attention, SwiGLU, sparse and more won't be available.\n", + " Set XFORMERS_MORE_DETAILS=1 for more details\n", + "/home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n" + ] + } + ], "source": [ "from transformers import AutoModelForCausalLM, AutoConfig\n", "from optimum.intel.openvino import OVModelForCausalLM\n", @@ -283,7 +340,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "id": "91eb2ccf", "metadata": { "collapsed": false, @@ -295,7 +352,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3f21d0f5dc0c47a1874d474018861a33", + "model_id": "ce8099b73db84e03ae0c01afc5103f9e", "version_major": 2, "version_minor": 0 }, @@ -309,7 +366,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6ed3c612007a41d2a83352dff6e1ad38", + "model_id": "7d08de53a6bc4e2792ddc4d857f91923", "version_major": 2, "version_minor": 0 }, @@ -323,7 +380,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e65063e5107c470d8ca7f10babaad1fe", + "model_id": "077120b2d9f04a08953b597e5c0c1554", "version_major": 2, "version_minor": 0 }, @@ -370,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "id": "c4ef9112", "metadata": { "collapsed": false, @@ -378,28 +435,7 @@ "outputs_hidden": false } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:transformers_modules.baichuan-inc.Baichuan2-7B-Chat.5e81b14f7baec6576b9318d5932059b5ce89262c.modeling_baichuan:Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\n", - "pip install xformers.\n", - "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", - " return self.fget.__get__(instance, owner)()\n", - "/home/ethan/.cache/huggingface/modules/transformers_modules/baichuan-inc/Baichuan2-7B-Chat/5e81b14f7baec6576b9318d5932059b5ce89262c/modeling_baichuan.py:348: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if input_shape[-1] > 1:\n", - "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/nncf/torch/dynamic_graph/wrappers.py:83: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " op1 = operator(*args, **kwargs)\n", - "/home/ethan/.cache/huggingface/modules/transformers_modules/baichuan-inc/Baichuan2-7B-Chat/5e81b14f7baec6576b9318d5932059b5ce89262c/modeling_baichuan.py:67: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if past_key_values_length > 0:\n", - "/home/ethan/.cache/huggingface/modules/transformers_modules/baichuan-inc/Baichuan2-7B-Chat/5e81b14f7baec6576b9318d5932059b5ce89262c/modeling_baichuan.py:122: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if seq_len > self.max_seq_len_cached:\n", - "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/torch/jit/_trace.py:154: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. 
(Triggered internally at aten/src/ATen/core/TensorBody.h:486.)\n", - " if a.grad is not None:\n" - ] - } - ], + "outputs": [], "source": [ "from optimum.intel import OVWeightQuantizationConfig\n", "\n", @@ -495,6 +531,11 @@ " \"group_size\": 64,\n", " \"ratio\": 0.6,\n", " },\n", + " \"gemma-2b-it\": {\n", + " \"sym\": True,\n", + " \"group_size\": 64,\n", + " \"ratio\": 0.6,\n", + " },\n", " \"notus-7b-v1\": {\n", " \"sym\": True,\n", " \"group_size\": 64,\n", @@ -510,6 +551,11 @@ " \"group_size\": 128,\n", " \"ratio\": 0.8,\n", " },\n", + " \"gemma-7b-it\": {\n", + " \"sym\": True,\n", + " \"group_size\": 128,\n", + " \"ratio\": 0.8,\n", + " },\n", " \"chatglm2-6b\": {\n", " \"sym\": True,\n", " \"group_size\": 128,\n", @@ -596,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "id": "281f1d07-998e-4e13-ba95-0264564ede82", "metadata": {}, "outputs": [ @@ -604,7 +650,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Size of FP16 model is 14318.51 MB\n" + "Size of FP16 model is 16796.33 MB\n", + "Size of model with INT4 compressed weights is 5802.53 MB\n", + "Compression rate for INT4 model: 2.895\n" ] } ], @@ -640,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "id": "837b4a3b-ccc3-4004-9577-2b2c7b802dea", "metadata": { "tags": [] @@ -649,15 +697,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6531e06b82754134b061d8fdf2813a16", + "model_id": "ee02a8c34bb0418eb1b80dcabae85481", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')" + "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU')" ] }, - "execution_count": 20, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -685,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "id": "5333ab9b-ff5d-4a7f-bcdc-9cca5d56dc0a", "metadata": { "tags": [] @@ -708,7 +756,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 11, "id": "3536a1a7", "metadata": { "collapsed": false, @@ -720,15 +768,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "eb7388d721134e6c997cb24641048db6", + "model_id": "f0e1516d95504a05bbd9be972e27a32d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('FP16',), value='FP16')" + "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" ] }, - "execution_count": 22, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -754,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 12, "id": "7a041101-7336-40fd-96c9-cd298015a0f3", "metadata": { "tags": [] @@ -764,7 +812,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loading model from baichuan2-7b-chat/FP16\n" + "Loading model from gemma-7b-it/INT4_compressed_weights\n" ] }, { @@ -814,7 +862,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 13, "id": "8f6f7596-5677-4931-875b-aaabfa23cabc", "metadata": {}, "outputs": [ @@ -881,7 +929,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 14, "id": "01f8f7f8-072e-45dc-b7c9-18d8c3c47754", "metadata": { "tags": [] @@ -891,7 +939,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Running on local URL: http://10.3.233.99:5587\n", + "Running on local URL: http://127.0.0.1:7860\n", "\n", "To create a 
public link, set `share=True` in `launch()`.\n" ] @@ -899,7 +947,7 @@ { "data": { "text/html": [ - "
" + "
" ], "text/plain": [ "" @@ -912,7 +960,7 @@ "data": { "text/plain": [] }, - "execution_count": 29, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1286,7 +1334,6 @@ " )\n", " clear.click(lambda: None, None, chatbot, queue=False)\n", "\n", - "demo.queue(max_size=2)\n", "# if you are launching remotely, specify server_name and server_port\n", "# demo.launch(server_name='your server name', server_port='server port in int')\n", "# if you have any issue to launch on your platform, you can pass share=True to launch method:\n", @@ -1297,21 +1344,13 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 15, "id": "7b837f9e-4152-4a5c-880a-ed874aa64a74", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Closing server running on port: 5587\n" - ] - } - ], + "outputs": [], "source": [ "# please uncomment and run this cell for stopping gradio interface\n", - "demo.close()" + "# demo.close()" ] } ], @@ -1331,7 +1370,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.8.10" }, "openvino_notebooks": { "imageUrl": "https://user-images.githubusercontent.com/29454499/255799218-611e7189-8979-4ef5-8a80-5a75e0136b50.png", @@ -1350,7 +1389,188 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "state": {}, + "state": { + "077120b2d9f04a08953b597e5c0c1554": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare FP16 model", + "disabled": false, + "layout": "IPY_MODEL_b9838a0579234ed0baddf8549fd2d493", + "style": "IPY_MODEL_5047e61ae74f4d2f80cf58ad0a00b46a", + "value": false + } + }, + "1e9dc777f98642e6b9d4814673c5bf02": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "49c4796193264023ad2b89cd5483abb4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "5047e61ae74f4d2f80cf58ad0a00b46a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "6fd1d9c0d42a457cbf5f6fcf48e3f78a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "765dcc7d2eff40a5a42d736b78aff037": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "tiny-llama-1b-chat", + "minicpm-2b-dpo", + "gemma-2b-it", + "red-pajama-3b-chat", + "gemma-7b-it", + "llama-2-chat-7b", + "mpt-7b-chat", + "qwen1.5-7b-chat", + "chatglm3-6b", + "mistral-7b", + "zephyr-7b-beta", + "neural-chat-7b-v3-1", + "notus-7b-v1", + "youri-7b-chat", + "baichuan2-7b-chat" + ], + "description": "Model:", + "index": 4, + "layout": "IPY_MODEL_bc0f92def39a4bd68ea67c6ee7e6ce9c", + "style": "IPY_MODEL_ed8d5d3e451f40a7a12820629c63fc28" + } + }, + "7d08de53a6bc4e2792ddc4d857f91923": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare INT8 model", + "disabled": false, + "layout": "IPY_MODEL_88168e39ff8c4539b2d72fc33c96f153", + "style": "IPY_MODEL_8b1f4ce889f840389dfcc8e852bd6b80", + "value": false + } + }, + 
"88168e39ff8c4539b2d72fc33c96f153": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "8b1f4ce889f840389dfcc8e852bd6b80": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "9798bbfc9c19405a843d6e298606097c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "b9838a0579234ed0baddf8549fd2d493": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "bbb319f1f4d246f4aaf819ff05ce5b76": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "bc0f92def39a4bd68ea67c6ee7e6ce9c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "ce8099b73db84e03ae0c01afc5103f9e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare INT4 model", + "disabled": false, + "layout": "IPY_MODEL_1e9dc777f98642e6b9d4814673c5bf02", + "style": "IPY_MODEL_9798bbfc9c19405a843d6e298606097c", + "value": true + } + }, + "cf56e91842934e52a73391c7a740105a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "ed8d5d3e451f40a7a12820629c63fc28": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "ee02a8c34bb0418eb1b80dcabae85481": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "CPU", + "GPU.0", + "GPU.1", + "AUTO" + ], + "description": "Device:", + "index": 0, + "layout": "IPY_MODEL_6fd1d9c0d42a457cbf5f6fcf48e3f78a", + "style": "IPY_MODEL_bbb319f1f4d246f4aaf819ff05ce5b76" + } + }, + "f0e1516d95504a05bbd9be972e27a32d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "INT4", + "FP16" + ], + "description": "Model to run:", + "index": 0, + "layout": "IPY_MODEL_cf56e91842934e52a73391c7a740105a", + "style": "IPY_MODEL_49c4796193264023ad2b89cd5483abb4" + } + } + }, "version_major": 2, "version_minor": 0 } diff --git a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb index 923e60b3cc5..15d17007fe8 100644 --- a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb @@ -64,12 +64,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[33mWARNING: Skipping openvino-dev as it is not installed.\u001B[0m\u001B[33m\n", - "\u001B[0m\u001B[33mWARNING: Skipping openvino as it is not installed.\u001B[0m\u001B[33m\n", - "\u001B[0mNote: you may need to restart the kernel to use updated packages.\n", + "\u001b[33mWARNING: Skipping openvino-dev as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Skipping openvino as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use 
updated packages.\n",
       "\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.3.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
       "Note: you may need to restart the kernel to use updated packages.\n"
      ]
     }
    ],
    "source": [
     "%pip uninstall -q -y openvino-dev openvino openvino-nightly optimum optimum-intel\n",
     "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\\\n",
     "\"git+https://github.com/huggingface/optimum-intel.git\"\\\n",
-    "\"nncf>=2.8.0\"\\\n",
+    "\"git+https://github.com/openvinotoolkit/nncf.git\"\\\n",
    "\"datasets\"\\\n",
    "\"accelerate\"\\\n",
    "\"openvino-nightly\"\\\n",
@@ -106,10 +106,41 @@
    "The available LLM model options are:\n",
    "\n",
    "* **tiny-llama-1b-chat** - This is the chat model finetuned on top of [TinyLlama/TinyLlama-1.1B-intermediate-step-955k-2T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T). The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens with the adoption of the same architecture and tokenizer as Llama 2. This means TinyLlama can be plugged and played in many open-source projects built upon Llama. Besides, TinyLlama is compact with only 1.1B parameters. This compactness allows it to cater to a multitude of applications demanding a restricted computation and memory footprint. More details about model can be found in [model card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.6)\n",
+    "* **gemma-2b-it** - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights, pre-trained variants, and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. This model is the instruction-tuned version of the 2B-parameter model. More details about the model can be found in the [model card](https://huggingface.co/google/gemma-2b-it).\n",
+    ">**Note**: to run the model with the demo, you will need to accept the license agreement. \n",
+    ">You must be a registered user in the 🤗 Hugging Face Hub. Please visit the [HuggingFace model card](https://huggingface.co/google/gemma-2b-it), carefully read the terms of usage, and click the accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n",
+    ">You can log in to the Hugging Face Hub in the notebook environment using the following code:\n",
+    " \n",
+    "```python\n",
+    "    ## login to huggingfacehub to get access to pretrained model \n",
+    "\n",
+    "    from huggingface_hub import notebook_login, whoami\n",
+    "\n",
+    "    try:\n",
+    "        whoami()\n",
+    "        print('Authorization token already provided')\n",
+    "    except OSError:\n",
+    "        notebook_login()\n",
+    "```\n",
    "* **mini-cpm-2b-dpo** - MiniCPM is an End-Size LLM developed by ModelBest Inc. and TsinghuaNLP, with only 2.4B parameters excluding embeddings. After Direct Preference Optimization (DPO) fine-tuning, MiniCPM outperforms many popular 7b, 13b and 70b models. More details can be found in [model_card](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp16).\n",
    "* **red-pajama-3b-chat** - A 2.8B parameter pre-trained language model based on GPT-NEOX architecture. It was developed by Together Computer and leaders from the open-source AI community. The model is fine-tuned on OASST1 and Dolly2 datasets to enhance chatting ability. More details about model can be found in [HuggingFace model card](https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1).\n",
+    "* **gemma-7b-it** - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights, pre-trained variants, and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. This model is the instruction-tuned version of the 7B-parameter model. More details about the model can be found in the [model card](https://huggingface.co/google/gemma-7b-it).\n",
+    ">**Note**: to run the model with the demo, you will need to accept the license agreement. \n",
+    ">You must be a registered user in the 🤗 Hugging Face Hub. Please visit the [HuggingFace model card](https://huggingface.co/google/gemma-7b-it), carefully read the terms of usage, and click the accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n",
+    ">You can log in to the Hugging Face Hub in the notebook environment using the following code:\n",
+    " \n",
+    "```python\n",
+    "    ## login to huggingfacehub to get access to pretrained model \n",
+    "\n",
+    "    from huggingface_hub import notebook_login, whoami\n",
+    "\n",
+    "    try:\n",
+    "        whoami()\n",
+    "        print('Authorization token already provided')\n",
+    "    except OSError:\n",
+    "        notebook_login()\n",
+    "```\n",
    "* **llama-2-7b-chat** - LLama 2 is the second generation of LLama models developed by Meta. Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. llama-2-7b-chat is 7 billions parameters version of LLama 2 finetuned and optimized for dialogue use case. 
More details about model can be found in the [paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/), [repository](https://github.com/facebookresearch/llama) and [HuggingFace model card](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).\n", - "* **qwen1.5-7b-chat** - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. Qwen1.5 is a language model series including decoder language models of different model sizes. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, mixture of sliding window attention and full attention. You can find more details about model in the [model card](https://huggingface.co/Qwen/Qwen1.5-7B-Chat).\n", ">**Note**: run model with demo, you will need to accept license agreement. \n", ">You must be a registered user in 🤗 Hugging Face Hub. Please visit [HuggingFace model card](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), carefully read terms of usage and click accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n", ">You can login on Hugging Face Hub in notebook environment, using following code:\n", @@ -125,6 +156,7 @@ " except OSError:\n", " notebook_login()\n", "```\n", + "* **qwen1.5-7b-chat** - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. Qwen1.5 is a language model series including decoder language models of different model sizes. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, mixture of sliding window attention and full attention. You can find more details about model in the [model card](https://huggingface.co/Qwen/Qwen1.5-7B-Chat).\n", "* **mpt-7b-chat** - MPT-7B is part of the family of MosaicPretrainedTransformer (MPT) models, which use a modified transformer architecture optimized for efficient training and inference. These architectural changes include performance-optimized layer implementations and the elimination of context length limits by replacing positional embeddings with Attention with Linear Biases ([ALiBi](https://arxiv.org/abs/2108.12409)). Thanks to these modifications, MPT models can be trained with high throughput efficiency and stable convergence. MPT-7B-chat is a chatbot-like model for dialogue generation. It was built by finetuning MPT-7B on the [ShareGPT-Vicuna](https://huggingface.co/datasets/jeffwan/sharegpt_vicuna), [HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3), [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca), [HH-RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf), and [Evol-Instruct](https://huggingface.co/datasets/victor123/evol_instruct_70k) datasets. More details about the model can be found in [blog post](https://www.mosaicml.com/blog/mpt-7b), [repository](https://github.com/mosaicml/llm-foundry/) and [HuggingFace model card](https://huggingface.co/mosaicml/mpt-7b-chat).\n", "* **chatglm3-6b** - ChatGLM3-6B is the latest open-source model in the ChatGLM series. While retaining many excellent features such as smooth dialogue and low deployment threshold from the previous two generations, ChatGLM3-6B employs a more diverse training dataset, more sufficient training steps, and a more reasonable training strategy. 
ChatGLM3-6B adopts a newly designed [Prompt format](https://github.com/THUDM/ChatGLM3/blob/main/PROMPT_en.md), in addition to the normal multi-turn dialogue. You can find more details about model in the [model card](https://huggingface.co/THUDM/chatglm3-6b)\n", "* **mistral-7b** - The Mistral-7B-v0.1 Large Language Model (LLM) is a pretrained generative text model with 7 billion parameters. You can find more details about model in the [model card](https://huggingface.co/mistralai/Mistral-7B-v0.1), [paper](https://arxiv.org/abs/2310.06825) and [release blog post](https://mistral.ai/news/announcing-mistral-7b/).\n", @@ -1478,7 +1510,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.8.10" }, "openvino_notebooks": { "imageUrl": "", @@ -1496,873 +1528,7 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "state": { - "073fa67d3868462d868acde42f8fe6b0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "0a270c30bd8d467d92a663efc0b05888": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_e1e19ff5fd984db4ab97d31982e42e65", - "style": "IPY_MODEL_e3419ab81c804699a489d6f27b27b8d2", - "value": " 363/363 [00:00<00:00, 26.4kB/s]" - } - }, - "1612e7d50c304ab787ab2cd25991af22": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_76faba05646d4480a207923296603e0c", - "IPY_MODEL_66dab8bd018549c2a28714fa5262ecce", - "IPY_MODEL_b9c885bd35d94a23887ad8a8641d7234" - ], - "layout": "IPY_MODEL_3a488af82edd462faa61ea686d84f581" - } - }, - "18069a604ea84516aaa6f29f2cd688e8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_8576d3a3283746e8ac80be86a755a2c5", - "style": "IPY_MODEL_e86f6b5a8272467e878b769cc561becb", - "value": " 571/571 [00:00<00:00, 41.6kB/s]" - } - }, - "1a4b5c3b175943f28acf7e251e395f53": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "1c516c8f817145fd9244146cb8bfd3e1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "24923dbb2cab42e98c8d5f3ebb5ec5be": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_cacdb640580242cbb0fb7be45e8713a3", - "max": 571, - "style": "IPY_MODEL_9c0305d21f0f45ec9ad88b69d78326ae", - "value": 571 - } - }, - "24b9443091314031b026d84c7e42f4ac": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_e3c6ff1ec7b147ada3120d459cb7600d", - "max": 239, - "style": "IPY_MODEL_327806293d3d4617b8bd94924ac69549", - "value": 239 - } - }, - "254081e676514e92a69232264175f1c2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": 
"IPY_MODEL_305c75cc3ef84bb880f2861022f4bc1e", - "max": 438011953, - "style": "IPY_MODEL_28f1f3eeb69b4d26b73e6726d716e62b", - "value": 438011953 - } - }, - "28f1f3eeb69b4d26b73e6726d716e62b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "29cdb10da5a742778ae0fa7208b29b54": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxStyleModel", - "state": { - "description_width": "" - } - }, - "2a0898c10f484b3b9d9c415cf48aca65": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "2a0ac80b0b5b49a9abec32010e70bbe0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "2a75b24f32d04ce7813eff46edefef19": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "2c66e599fa014417a89a23bb80af6ee2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_5ebee19b2b5c46a89b27331e20a3bf74", - "max": 363, - "style": "IPY_MODEL_564ecb88d9d442cba93532e7fdb50383", - "value": 363 - } - }, - "2d6e5f08617e4b2c8e434e3af764908d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_d239ddefafbe48e18acf4eaef67cb215", - "IPY_MODEL_24923dbb2cab42e98c8d5f3ebb5ec5be", - "IPY_MODEL_18069a604ea84516aaa6f29f2cd688e8" - ], - "layout": "IPY_MODEL_2a75b24f32d04ce7813eff46edefef19" - } - }, - "305c75cc3ef84bb880f2861022f4bc1e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "31f9a2a5b0a14f62ac0ab4783a502ae4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "INT4" - ], - "description": "Model to run:", - "index": 0, - "layout": "IPY_MODEL_f8eeb34363604db2afa343d7282dda3c", - "style": "IPY_MODEL_1a4b5c3b175943f28acf7e251e395f53" - } - }, - "327806293d3d4617b8bd94924ac69549": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "327b4cc2f1ec4c29a0c3bebf66d5196e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "37d7d309c98f43f4a064af846b964aff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "3a488af82edd462faa61ea686d84f581": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "3b43602baed04ab6baa5a143ec2e0df0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_1c516c8f817145fd9244146cb8bfd3e1", - "style": "IPY_MODEL_2a0ac80b0b5b49a9abec32010e70bbe0", - "value": "pytorch_model.bin: 100%" - } - 
}, - "3f9c3677e64142c884d6ed1a8ac0f200": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "42200eddb9c34232a9b39bb35dfc274e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "4329dd50bb3c4520a3eab387bd59eeac": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxModel", - "state": { - "description": "Prepare FP16 model", - "disabled": false, - "layout": "IPY_MODEL_b9aa2130c3f94a468a79690e0c0c7f3c", - "style": "IPY_MODEL_29cdb10da5a742778ae0fa7208b29b54", - "value": false - } - }, - "47edd4e0b5d647d4a13831bca3d45f88": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "48436392bbe846fdbec65d641ea28e26": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "4c88bcc7bcfa49e4b547ab84149ac936": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "4ccfb402e4494197b318bc8df395424a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "4d3a329214ee48509fe4096433407cc0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxModel", - "state": { - "description": "Prepare INT4 model", - "disabled": false, - "layout": "IPY_MODEL_3f9c3677e64142c884d6ed1a8ac0f200", - "style": "IPY_MODEL_a09f75b1930b4e30bfbf7ec1e8f8f423", - "value": true - } - }, - "509029f7043c4626a9de59ec2a28b172": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "54f908d9de5446d686dc2dd38b42a3cb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxModel", - "state": { - "description": "Prepare INT8 model", - "disabled": false, - "layout": "IPY_MODEL_f0dce48186e143c48d0b12874faaaf47", - "style": "IPY_MODEL_c461f703ad3b4fceb56d2ec9b6f9de19", - "value": false - } - }, - "562c105da8024178824d6cc3e6f2b573": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_b22c9425b3734f9a82f66b8b250bf125", - "IPY_MODEL_24b9443091314031b026d84c7e42f4ac", - "IPY_MODEL_ac46a1284d424fe780d536885cd2fc45" - ], - "layout": "IPY_MODEL_ae5b61bd5ab343bcac9445b77f2e689e" - } - }, - "564ecb88d9d442cba93532e7fdb50383": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "5d98e96ddcb544ecbd836d20d901e2b4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "CPU", - "GPU.0", - "GPU.1", - "AUTO" - ], - "description": "Device:", - "index": 0, - "layout": "IPY_MODEL_678be55661844b02bfe3c8c48f830e74", - "style": 
"IPY_MODEL_42200eddb9c34232a9b39bb35dfc274e" - } - }, - "5ebee19b2b5c46a89b27331e20a3bf74": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "66dab8bd018549c2a28714fa5262ecce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_37d7d309c98f43f4a064af846b964aff", - "max": 231536, - "style": "IPY_MODEL_8d302c7809d44d119a528c08853a4616", - "value": 231536 - } - }, - "678be55661844b02bfe3c8c48f830e74": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "6a69c024e6f44984b9fc50a2c52d353f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "6f6d200362cd46b2b4adad1561ca4c27": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_8672e209f5cc4f91ab07b6cc0ac0bb28", - "IPY_MODEL_2c66e599fa014417a89a23bb80af6ee2", - "IPY_MODEL_0a270c30bd8d467d92a663efc0b05888" - ], - "layout": "IPY_MODEL_92d67ee6bc8148be8628ff864171e919" - } - }, - "76faba05646d4480a207923296603e0c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_bb41c7e24b2f413883f3e6ebdbf6a85d", - "style": "IPY_MODEL_073fa67d3868462d868acde42f8fe6b0", - "value": "vocab.txt: 100%" - } - }, - "80496ec674924717b4ae48a59cdf37ff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "8576d3a3283746e8ac80be86a755a2c5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "860638e709f44264a8c8e12ae8ac7716": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "8672e209f5cc4f91ab07b6cc0ac0bb28": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_effc71050737431a82da7529a5e567b1", - "style": "IPY_MODEL_ab6d89a943b24ef093df38384acf6eb4", - "value": "tokenizer_config.json: 100%" - } - }, - "868bc11676d8497ea436faee99051f47": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "8d302c7809d44d119a528c08853a4616": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "8f177a485ea248cab3a7d1513da8aaaf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "92d67ee6bc8148be8628ff864171e919": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "94e33d222041417eb876dabcb1103636": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - 
"960f1e44f89e4b079d759ac4f71e1878": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "96e4bba6729b43d2a13b8b2e0d5d98fa": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "9c0305d21f0f45ec9ad88b69d78326ae": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "9c14b642a6d646658ea28bd8ae668082": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "a09f75b1930b4e30bfbf7ec1e8f8f423": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxStyleModel", - "state": { - "description_width": "" - } - }, - "a86cbfb44d3843b9beb57cb076f9afe7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "CPU", - "GPU.0", - "GPU.1", - "AUTO" - ], - "description": "Device:", - "index": 0, - "layout": "IPY_MODEL_9c14b642a6d646658ea28bd8ae668082", - "style": "IPY_MODEL_960f1e44f89e4b079d759ac4f71e1878" - } - }, - "ab6d89a943b24ef093df38384acf6eb4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "ac46a1284d424fe780d536885cd2fc45": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_80496ec674924717b4ae48a59cdf37ff", - "style": "IPY_MODEL_48436392bbe846fdbec65d641ea28e26", - "value": " 239/239 [00:00<00:00, 21.0kB/s]" - } - }, - "ae5b61bd5ab343bcac9445b77f2e689e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "aff572932d3747a8a851cb7b2d319035": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "b09adb3b4ac94ce88a3e7b27f740dc30": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "b22c9425b3734f9a82f66b8b250bf125": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_f06fad8b77354ee194e31f499dad4b4d", - "style": "IPY_MODEL_327b4cc2f1ec4c29a0c3bebf66d5196e", - "value": "special_tokens_map.json: 100%" - } - }, - "b41cef77af61419f9d51acf19c8da9a1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "b5e4d686d6b24dc6a97b4e087ceafe10": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "tiny-llama-1b-chat", - "minicpm-2b-dpo", - "red-pajama-3b-chat", - "llama-2-chat-7b", - "mpt-7b-chat", - "qwen-7b-chat", - "chatglm3-6b", - "mistral-7b", - "zephyr-7b-beta", - "neural-chat-7b-v3-1", - "notus-7b-v1", - "youri-7b-chat" - ], - "description": "LLM Model:", - "index": 0, - "layout": "IPY_MODEL_8f177a485ea248cab3a7d1513da8aaaf", - "style": "IPY_MODEL_fbf0b9169ad547789ccd45ed9a4f5ce9" - } - }, - 
"b8227d878838476caa6a97be9971bc93": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "b9aa2130c3f94a468a79690e0c0c7f3c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "b9c885bd35d94a23887ad8a8641d7234": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_4c88bcc7bcfa49e4b547ab84149ac936", - "style": "IPY_MODEL_dae19cd938e4427cabef195944543a0e", - "value": " 232k/232k [00:00<00:00, 481kB/s]" - } - }, - "bb1580e67c674d4f9f1a7d2539ef81df": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "bb41c7e24b2f413883f3e6ebdbf6a85d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "c461f703ad3b4fceb56d2ec9b6f9de19": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxStyleModel", - "state": { - "description_width": "" - } - }, - "c693bbd2c1244da5a5a16fcf5de2bf80": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_e19c5a3c98c448d5bf0bff5fb595a563", - "max": 466021, - "style": "IPY_MODEL_fb09beb84c5343a6a4d7a3c938152ff9", - "value": 466021 - } - }, - "cacdb640580242cbb0fb7be45e8713a3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "cd45aebea8dc427f9d8a06c82b5fed69": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": "IPY_MODEL_b8227d878838476caa6a97be9971bc93", - "outputs": [ - { - "data": { - "text/html": "
Searching for Mixed-Precision Configuration ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 154/1540:00:120:00:00\n
\n", - "text/plain": "Searching for Mixed-Precision Configuration \u001B[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[35m100%\u001B[0m \u001B[38;2;0;104;181m154/154\u001B[0m • \u001B[38;2;0;104;181m0:00:12\u001B[0m • \u001B[38;2;0;104;181m0:00:00\u001B[0m\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "cfb7aa76210e4f83b3ce1c66b1510b79": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_f1fd1e5c2ee440deb144585b7fc16495", - "IPY_MODEL_c693bbd2c1244da5a5a16fcf5de2bf80", - "IPY_MODEL_d5983501c77441b8a70bff1b1a847894" - ], - "layout": "IPY_MODEL_860638e709f44264a8c8e12ae8ac7716" - } - }, - "d09e2900363e46509a2723e3c797d33a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "all-mpnet-base-v2" - ], - "description": "Embedding Model:", - "index": 0, - "layout": "IPY_MODEL_2a0898c10f484b3b9d9c415cf48aca65", - "style": "IPY_MODEL_6a69c024e6f44984b9fc50a2c52d353f" - } - }, - "d239ddefafbe48e18acf4eaef67cb215": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_b09adb3b4ac94ce88a3e7b27f740dc30", - "style": "IPY_MODEL_4ccfb402e4494197b318bc8df395424a", - "value": "config.json: 100%" - } - }, - "d5983501c77441b8a70bff1b1a847894": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_96e4bba6729b43d2a13b8b2e0d5d98fa", - "style": "IPY_MODEL_94e33d222041417eb876dabcb1103636", - "value": " 466k/466k [00:00<00:00, 1.86MB/s]" - } - }, - "d7cf405016774569a21930dfe2214f59": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_509029f7043c4626a9de59ec2a28b172", - "style": "IPY_MODEL_868bc11676d8497ea436faee99051f47", - "value": " 438M/438M [00:24<00:00, 19.2MB/s]" - } - }, - "dae19cd938e4427cabef195944543a0e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "e19c5a3c98c448d5bf0bff5fb595a563": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "e1e19ff5fd984db4ab97d31982e42e65": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "e3419ab81c804699a489d6f27b27b8d2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "e3c6ff1ec7b147ada3120d459cb7600d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "e86f6b5a8272467e878b769cc561becb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "e8d92cadd240427e9e8dc606c7654806": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": 
"IPY_MODEL_b41cef77af61419f9d51acf19c8da9a1", - "outputs": [ - { - "data": { - "text/html": "
Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 156/1560:00:260:00:00\n
\n", - "text/plain": "Applying Weight Compression \u001B[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[35m100%\u001B[0m \u001B[38;2;0;104;181m156/156\u001B[0m • \u001B[38;2;0;104;181m0:00:26\u001B[0m • \u001B[38;2;0;104;181m0:00:00\u001B[0m\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "ec344fcaf9f84a4fbcc7981168c28f4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_3b43602baed04ab6baa5a143ec2e0df0", - "IPY_MODEL_254081e676514e92a69232264175f1c2", - "IPY_MODEL_d7cf405016774569a21930dfe2214f59" - ], - "layout": "IPY_MODEL_aff572932d3747a8a851cb7b2d319035" - } - }, - "effc71050737431a82da7529a5e567b1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "f06fad8b77354ee194e31f499dad4b4d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "f0dce48186e143c48d0b12874faaaf47": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "f1fd1e5c2ee440deb144585b7fc16495": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_bb1580e67c674d4f9f1a7d2539ef81df", - "style": "IPY_MODEL_47edd4e0b5d647d4a13831bca3d45f88", - "value": "tokenizer.json: 100%" - } - }, - "f8eeb34363604db2afa343d7282dda3c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "fb09beb84c5343a6a4d7a3c938152ff9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "fbf0b9169ad547789ccd45ed9a4f5ce9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - } - }, + "state": {}, "version_major": 2, "version_minor": 0 } diff --git a/notebooks/254-llm-chatbot/README.md b/notebooks/254-llm-chatbot/README.md index 8c48b0ad1eb..d9febb2288d 100644 --- a/notebooks/254-llm-chatbot/README.md +++ b/notebooks/254-llm-chatbot/README.md @@ -12,20 +12,26 @@ The tutorial supports different models, you can select one from provided options The available options are: -* **tiny-llama-1b-chat** - This is the chat model finetuned on top of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T). The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens with the adoption of the same architecture and tokenizer as Llama 2. This means TinyLlama can be plugged and played in many open-source projects built upon Llama. Besides, TinyLlama is compact with only 1.1B parameters. This compactness allows it to cater to a multitude of applications demanding a restricted computation and memory footprint. More details about model can be found in [model card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0). +* **tiny-llama-1b-chat** - This is the chat model finetuned on top of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T). 
The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens, adopting the same architecture and tokenizer as Llama 2, so TinyLlama can be plugged into many open-source projects built upon Llama. Besides, TinyLlama is compact, with only 1.1B parameters, which allows it to cater to a multitude of applications demanding a restricted computation and memory footprint. More details about the model can be found in the [model card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0).
 * **mini-cpm-2b-dpo** - MiniCPM is an End-Side LLM developed by ModelBest Inc. and TsinghuaNLP, with only 2.4B parameters excluding embeddings. After Direct Preference Optimization (DPO) fine-tuning, MiniCPM outperforms many popular 7b, 13b and 70b models. More details can be found in the [model card](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp16).
+* **gemma-2b-it** - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights, pre-trained variants, and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. This model is the instruction-tuned version of the 2B-parameter model. More details about the model can be found in the [model card](https://huggingface.co/google/gemma-2b-it).
+>**Note**: to run the model with the demo, you will need to accept the license agreement.
+>You must be a registered user in the 🤗 Hugging Face Hub. Please visit the [HuggingFace model card](https://huggingface.co/google/gemma-2b-it), carefully read the terms of usage, and click the accept button. You will need to use an access token for the code below to run (a minimal login sketch is shown after this list). For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+* **red-pajama-3b-chat** - A 2.8B parameter pre-trained language model based on the GPT-NEOX architecture. It was developed by Together Computer and leaders from the open-source AI community. The model is fine-tuned on the OASST1 and Dolly2 datasets to enhance chatting ability. More details about the model can be found in the [HuggingFace model card](https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1).
+* **gemma-7b-it** - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights, pre-trained variants, and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. This model is the instruction-tuned version of the 7B-parameter model. More details about the model can be found in the [model card](https://huggingface.co/google/gemma-7b-it).
+>**Note**: to run the model with the demo, you will need to accept the license agreement.
+>You must be a registered user in the 🤗 Hugging Face Hub. Please visit the [HuggingFace model card](https://huggingface.co/google/gemma-7b-it), carefully read the terms of usage, and click the accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+* **llama-2-7b-chat** - Llama 2 is the second generation of Llama models developed by Meta. Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. llama-2-7b-chat is the 7 billion parameter version of Llama 2, fine-tuned and optimized for dialogue use cases.
More details about the model can be found in the [paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/), [repository](https://github.com/facebookresearch/llama) and [HuggingFace model card](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+>**Note**: to run the model with the demo, you will need to accept the license agreement.
+>You must be a registered user in the 🤗 Hugging Face Hub. Please visit the [HuggingFace model card](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), carefully read the terms of usage, and click the accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+* **qwen1.5-7b-chat** - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. Qwen1.5 is a language model series including decoder language models of different sizes. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, and a mixture of sliding window attention and full attention. You can find more details about the model in the [model card](https://huggingface.co/Qwen/Qwen1.5-7B-Chat).
+* **mpt-7b-chat** - MPT-7B is part of the family of MosaicPretrainedTransformer (MPT) models, which use a modified transformer architecture optimized for efficient training and inference. These architectural changes include performance-optimized layer implementations and the elimination of context length limits by replacing positional embeddings with Attention with Linear Biases ([ALiBi](https://arxiv.org/abs/2108.12409)). Thanks to these modifications, MPT models can be trained with high throughput efficiency and stable convergence. MPT-7B-chat is a chatbot-like model for dialogue generation. It was built by fine-tuning MPT-7B on the [ShareGPT-Vicuna](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered), [HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3), [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca), [HH-RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf), and [Evol-Instruct](https://huggingface.co/datasets/victor123/evol_instruct_70k) datasets. More details about the model can be found in the [blog post](https://www.mosaicml.com/blog/mpt-7b), [repository](https://github.com/mosaicml/llm-foundry/) and [HuggingFace model card](https://huggingface.co/mosaicml/mpt-7b-chat).
 * **chatglm3-6b** - ChatGLM3-6B is the latest open-source model in the ChatGLM series. While retaining many excellent features such as smooth dialogue and low deployment threshold from the previous two generations, ChatGLM3-6B employs a more diverse training dataset, more sufficient training steps, and a more reasonable training strategy. ChatGLM3-6B adopts a newly designed [Prompt format](https://github.com/THUDM/ChatGLM3/blob/main/PROMPT_en.md), in addition to the normal multi-turn dialogue. You can find more details about the model in the [model card](https://huggingface.co/THUDM/chatglm3-6b).
-* **mistral-7b** - The Mistral-7B-v0.1 Large Language Model (LLM) is a pretrained generative text model with 7 billion parameters. You can find more details about model in the [paper](https://arxiv.org/abs/2310.06825) and [release blog post](https://mistral.ai/news/announcing-mistral-7b/).
+* **mistral-7b** - The Mistral-7B-v0.1 Large Language Model (LLM) is a pretrained generative text model with 7 billion parameters. You can find more details about the model in the [model card](https://huggingface.co/mistralai/Mistral-7B-v0.1), [paper](https://arxiv.org/abs/2310.06825) and [release blog post](https://mistral.ai/news/announcing-mistral-7b/).
 * **zephyr-7b-beta** - Zephyr is a series of language models that are trained to act as helpful assistants. Zephyr-7B-beta is the second model in the series, and is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) that was trained on a mix of publicly available, synthetic datasets using [Direct Preference Optimization (DPO)](https://arxiv.org/abs/2305.18290). You can find more details about the model in the [technical report](https://arxiv.org/abs/2310.16944) and [HuggingFace model card](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta).
 * **neural-chat-7b-v3-1** - A Mistral-7b model fine-tuned using Intel Gaudi. The model was fine-tuned on the open-source dataset [Open-Orca/SlimOrca](https://huggingface.co/datasets/Open-Orca/SlimOrca) and aligned with the [Direct Preference Optimization (DPO) algorithm](https://arxiv.org/abs/2305.18290). More details can be found in the [model card](https://huggingface.co/Intel/neural-chat-7b-v3-3) and [blog post](https://medium.com/@NeuralCompressor/the-practice-of-supervised-finetuning-and-direct-preference-optimization-on-habana-gaudi2-a1197d8a3cd3).
 * **notus-7b-v1** - Notus is a collection of models fine-tuned using [Direct Preference Optimization (DPO)](https://arxiv.org/abs/2305.18290) and related [RLHF](https://huggingface.co/blog/rlhf) techniques. This model is the first version, fine-tuned with DPO over zephyr-7b-sft. Following a data-first approach, the only difference between Notus-7B-v1 and Zephyr-7B-beta is the preference dataset used for dDPO. The proposed approach to dataset creation helps to effectively fine-tune Notus-7b, which surpasses Zephyr-7B-beta and Claude 2 on [AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/). More details about the model can be found in the [model card](https://huggingface.co/argilla/notus-7b-v1).
 * **youri-7b-chat** - Youri-7b-chat is a Llama2-based model. [Rinna Co., Ltd.](https://rinna.co.jp/) conducted further pre-training of the Llama2 model with a mixture of English and Japanese datasets to improve Japanese task capability. The model is publicly released on the Hugging Face hub. You can find detailed information at the [rinna/youri-7b-chat project page](https://huggingface.co/rinna/youri-7b).
 * **baichuan2-7b-chat** - Baichuan 2 is the new generation of large-scale open-source language models launched by [Baichuan Intelligence inc](https://www.baichuan-ai.com/home). It is trained on a high-quality corpus with 2.6 trillion tokens and has achieved the best performance among models of the same size on authoritative Chinese and English benchmarks.
 
 The image below illustrates the provided user instruction and model answer examples.
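Several of the options above (the Gemma and Llama 2 checkpoints) are gated, so they can only be downloaded after authenticating to the Hub. A minimal login sketch, assuming the license has already been accepted on the model card and a read token has been created under the account settings:

```python
# Minimal sketch (not part of the patch): authenticate to the Hugging Face Hub
# so that gated checkpoints such as google/gemma-2b-it can be downloaded.
# The token value below is a placeholder; substitute your own access token.
from huggingface_hub import login

login(token="hf_...")
```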
diff --git a/notebooks/254-llm-chatbot/config.py b/notebooks/254-llm-chatbot/config.py
index 54be1e5632b..828654285e4 100644
--- a/notebooks/254-llm-chatbot/config.py
+++ b/notebooks/254-llm-chatbot/config.py
@@ -86,6 +86,14 @@ def youri_partial_text_processor(partial_text, new_text):
 Answer: <|assistant|>""",
     },
+    "gemma-2b-it": {
+        "model_id": "google/gemma-2b-it",
+        "remote": True,
+        "start_message": DEFAULT_SYSTEM_PROMPT + ", ",
+        "history_template": "<start_of_turn>user{user}<end_of_turn><start_of_turn>model{assistant}<end_of_turn>",
+        "current_message_template": "<start_of_turn>user{user}<end_of_turn><start_of_turn>model{assistant}",
+        "prompt_template": f"""{DEFAULT_RAG_PROMPT},""" + """<start_of_turn>user{question}context{context}<end_of_turn><start_of_turn>model""",
+    },
     "red-pajama-3b-chat": {
         "model_id": "togethercomputer/RedPajama-INCITE-Chat-3B-v1",
         "remote": False,
@@ -100,6 +108,14 @@ def youri_partial_text_processor(partial_text, new_text):
 Context: {context}
 Answer: """,
     },
+    "gemma-7b-it": {
+        "model_id": "google/gemma-7b-it",
+        "remote": True,
+        "start_message": DEFAULT_SYSTEM_PROMPT + ", ",
+        "history_template": "<start_of_turn>user{user}<end_of_turn><start_of_turn>model{assistant}<end_of_turn>",
+        "current_message_template": "<start_of_turn>user{user}<end_of_turn><start_of_turn>model{assistant}",
+        "prompt_template": f"""{DEFAULT_RAG_PROMPT},""" + """<start_of_turn>user{question}context{context}<end_of_turn><start_of_turn>model""",
+    },
     "llama-2-chat-7b": {
         "model_id": "meta-llama/Llama-2-7b-chat-hf",
         "remote": False,
diff --git a/notebooks/254-llm-chatbot/converter.py b/notebooks/254-llm-chatbot/converter.py
index cd2b6b41caf..632c11797ed 100644
--- a/notebooks/254-llm-chatbot/converter.py
+++ b/notebooks/254-llm-chatbot/converter.py
@@ -377,13 +377,75 @@ def convert_chatglm(pt_model: torch.nn.Module, model_path: Path):
     ov_model.validate_nodes_and_infer_types()
     if make_stateful is not None:
-        print("PATCH STATEFUL")
         patch_stateful(ov_model, "chatglm")
     ov.save_model(ov_model, ov_out_path)
     del ov_model
     cleanup_torchscript_cache()
     del pt_model
 
+
+def convert_gemma(pt_model: torch.nn.Module, model_path: Path):
+    """
+    Gemma model conversion function
+
+    Params:
+      pt_model: PyTorch model
+      model_path: path for saving model
+    Returns:
+      None
+    """
+    ov_out_path = Path(model_path) / "openvino_model.xml"
+    pt_model.config.save_pretrained(ov_out_path.parent)
+    pt_model.config.use_cache = True
+    # run the model once to materialize the past_key_values layout
+    outs = pt_model(input_ids=torch.ones((2, 10), dtype=torch.long))
+    inputs = ["input_ids"]
+    outputs = ["logits"]
+
+    dynamic_shapes = {
+        "input_ids": {0: "batch_size", 1: "seq_len"},
+        "attention_mask": {0: "batch_size", 1: "seq_len"},
+        "position_ids": {0: "batch_size", 1: "seq_len"},
+    }
+    inputs += ["attention_mask", "position_ids"]
+    for idx in range(len(outs.past_key_values)):
+        inputs.extend([f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"])
+        dynamic_shapes[inputs[-1]] = {0: "batch_size", 2: "past_sequence + sequence"}
+        dynamic_shapes[inputs[-2]] = {0: "batch_size", 2: "past_sequence + sequence"}
+        outputs.extend([f"present.{idx}.key", f"present.{idx}.value"])
+
+    dummy_inputs = {
+        "input_ids": torch.ones((2, 2), dtype=torch.long),
+        "attention_mask": torch.ones((2, 12), dtype=torch.long),
+        "position_ids": torch.tensor([[10, 11], [10, 11]], dtype=torch.long),
+        "past_key_values": outs.past_key_values,
+    }
+    pt_model.config.torchscript = True
+    ov_model = ov.convert_model(pt_model, example_input=dummy_inputs)
+    # assign names and mark batch/sequence dimensions dynamic on every input
+    for inp_name, m_input, input_data in zip(
+        inputs, ov_model.inputs, flattenize_inputs(dummy_inputs.values())
+    ):
+        input_node = m_input.get_node()
+        if input_node.element_type == ov.Type.dynamic:
+            m_input.get_node().set_element_type(ov.Type.f32)
+        shape = list(input_data.shape)
+        if inp_name in dynamic_shapes:
+            for k in dynamic_shapes[inp_name]:
+                shape[k] = -1
+        input_node.set_partial_shape(ov.PartialShape(shape))
+        m_input.get_tensor().set_names({inp_name})
+
+    for out, out_name in zip(ov_model.outputs, outputs):
+        out.get_tensor().set_names({out_name})
+
+    ov_model.validate_nodes_and_infer_types()
+    if make_stateful is not None:
+        patch_stateful(ov_model, "gemma")
+    ov.save_model(ov_model, ov_out_path)
+    del ov_model
+    cleanup_torchscript_cache()
+    del pt_model
+
+
 def convert_mpnet(pt_model: torch.nn.Module, model_path: Path):
     ov_out_path = Path(model_path) / "openvino_model.xml"
     dummy_inputs = {"input_ids": torch.ones((1, 10), dtype=torch.long), "attention_mask": torch.ones((1, 10), dtype=torch.long)}
@@ -404,6 +466,7 @@ def convert_bert(pt_model: torch.nn.Module, model_path: Path):
     "mpt": convert_mpt,
     "chatglm3": convert_chatglm,
     "baichuan2": convert_baichuan,
+    "gemma": convert_gemma,
     # embedding models
     "all-mpnet-base-v2": convert_mpnet,
     "text2vec-large-chinese": convert_bert,
diff --git a/notebooks/254-llm-chatbot/ov_llm_model.py b/notebooks/254-llm-chatbot/ov_llm_model.py
index 2a2e82a3bc5..59a53ef4024 100644
--- a/notebooks/254-llm-chatbot/ov_llm_model.py
+++ b/notebooks/254-llm-chatbot/ov_llm_model.py
@@ -367,4 +367,5 @@ def _from_pretrained(
     "mpt": OVMPTModel,
     "baichuan2": OVBAICHUANModel,
     "chatglm3": OVCHATGLMModel,
+    "gemma": OVModelForCausalLM
 }
\ No newline at end of file
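For orientation, here is a rough usage sketch (not part of the patch) showing how the pieces added above fit together, assuming the converter registry dict is exposed as `converters` in this notebook's `converter.py` and that the output path is illustrative:

```python
# Rough usage sketch: export google/gemma-2b-it to OpenVINO IR through the
# converter registered in converter.py. Assumes the notebook folder is on
# sys.path and the gated Gemma weights are accessible after Hub login.
from pathlib import Path

from transformers import AutoModelForCausalLM

from converter import converters

pt_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
converters["gemma"](pt_model, Path("gemma-2b-it/FP16"))  # writes openvino_model.xml
```

At inference time the exported IR is loaded through `OVModelForCausalLM`, which is why the registry in `ov_llm_model.py` maps `"gemma"` directly to that class rather than to a model-specific wrapper.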