From 48b309ede234c70ef9fc592ab8b676cd375e9b00 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 17:20:02 +0800 Subject: [PATCH 01/15] dev --- xinference/api/restful_api.py | 40 +- xinference/client/restful/restful_client.py | 27 +- xinference/client/tests/test_client.py | 5 +- xinference/core/chat_interface.py | 22 +- xinference/core/model.py | 43 +- xinference/core/scheduler.py | 56 +- xinference/core/supervisor.py | 5 +- .../core/tests/test_continuous_batching.py | 7 +- xinference/core/tests/test_metrics.py | 3 +- xinference/core/tests/test_restful_api.py | 3 +- xinference/core/tests/test_types.py | 3 - xinference/deploy/cmdline.py | 24 +- xinference/model/llm/__init__.py | 30 +- xinference/model/llm/llama_cpp/core.py | 43 +- xinference/model/llm/llm_family.json | 1878 +++++------------ xinference/model/llm/llm_family.py | 83 +- xinference/model/llm/llm_family_csghub.json | 53 +- .../model/llm/llm_family_modelscope.json | 1650 +++++---------- xinference/model/llm/lmdeploy/core.py | 144 +- xinference/model/llm/mlx/core.py | 97 +- xinference/model/llm/mlx/tests/test_mlx.py | 3 +- xinference/model/llm/sglang/core.py | 46 +- xinference/model/llm/tests/test_llm_family.py | 173 +- xinference/model/llm/tests/test_multimodal.py | 63 +- xinference/model/llm/tests/test_utils.py | 303 --- xinference/model/llm/transformers/chatglm.py | 472 ++--- xinference/model/llm/transformers/cogvlm2.py | 99 +- .../model/llm/transformers/cogvlm2_video.py | 256 +-- xinference/model/llm/transformers/core.py | 78 +- .../model/llm/transformers/deepseek_vl.py | 149 +- xinference/model/llm/transformers/glm4v.py | 166 +- .../model/llm/transformers/intern_vl.py | 109 +- .../model/llm/transformers/internlm2.py | 86 +- xinference/model/llm/transformers/llama_2.py | 108 - .../model/llm/transformers/minicpmv25.py | 77 +- .../model/llm/transformers/minicpmv26.py | 78 +- xinference/model/llm/transformers/omnilmm.py | 33 +- xinference/model/llm/transformers/qwen_vl.py | 118 +- .../llm/transformers/tests/test_tensorizer.py | 4 +- xinference/model/llm/transformers/utils.py | 72 +- xinference/model/llm/transformers/yi_vl.py | 104 +- xinference/model/llm/utils.py | 677 ++---- xinference/model/llm/vllm/core.py | 227 +- xinference/types.py | 4 +- 44 files changed, 2441 insertions(+), 5280 deletions(-) delete mode 100644 xinference/model/llm/transformers/llama_2.py diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 5a80941f9f..20a84a21ad 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -57,9 +57,7 @@ from ..core.supervisor import SupervisorActor from ..core.utils import json_dumps from ..types import ( - SPECIAL_TOOL_PROMPT, ChatCompletion, - ChatCompletionMessage, Completion, CreateChatCompletion, CreateCompletion, @@ -1627,33 +1625,7 @@ async def create_chat_completion(self, request: Request) -> Response: status_code=400, detail="Invalid input. Please specify the prompt." ) - system_messages: List["ChatCompletionMessage"] = [] - system_messages_contents = [] - non_system_messages = [] - for msg in messages: - assert ( - msg.get("content") != SPECIAL_TOOL_PROMPT - ), f"Invalid message content {SPECIAL_TOOL_PROMPT}" - if msg["role"] == "system": - system_messages_contents.append(msg["content"]) - else: - non_system_messages.append(msg) - system_messages.append( - {"role": "system", "content": ". 
".join(system_messages_contents)} - ) - has_tool_message = messages[-1].get("role") == "tool" - if has_tool_message: - prompt = SPECIAL_TOOL_PROMPT - system_prompt = system_messages[0]["content"] if system_messages else None - chat_history = non_system_messages # exclude the prompt - else: - prompt = None - if non_system_messages: - prompt = non_system_messages[-1]["content"] - system_prompt = system_messages[0]["content"] if system_messages else None - chat_history = non_system_messages[:-1] # exclude the prompt - model_uid = body.model try: @@ -1681,9 +1653,7 @@ async def create_chat_completion(self, request: Request) -> Response: from ..model.llm.utils import GLM4_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY model_family = desc.get("model_family", "") - function_call_models = ( - ["gorilla-openfunctions-v1"] + QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY - ) + function_call_models = QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY if model_family not in function_call_models: if body.tools: @@ -1716,9 +1686,7 @@ async def stream_results(): try: try: iterator = await model.chat( - prompt, - system_prompt, - chat_history, + messages, kwargs, raw_params=raw_kwargs, ) @@ -1750,9 +1718,7 @@ async def stream_results(): else: try: data = await model.chat( - prompt, - system_prompt, - chat_history, + messages, kwargs, raw_params=raw_kwargs, ) diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index 679f65d296..5958ca4134 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -13,7 +13,6 @@ # limitations under the License. import json import typing -import warnings from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union import requests @@ -470,9 +469,7 @@ def generate( class RESTfulChatModelHandle(RESTfulGenerateModelHandle): def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List["ChatCompletionMessage"]] = None, + messages: List[Dict], tools: Optional[List[Dict]] = None, generate_config: Optional[ Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"] @@ -483,11 +480,7 @@ def chat( Parameters ---------- - prompt: str - The user's input. - system_prompt: Optional[str] - The system context provide to Model prior to any chats. - chat_history: Optional[List["ChatCompletionMessage"]] + messages: List[Dict] A list of messages comprising the conversation so far. tools: Optional[List[Dict]] A tool list. @@ -509,25 +502,11 @@ def chat( Report the failure to generate the chat from the server. Detailed information provided in error message. 
""" - warnings.warn( - "The parameters `prompt`, `system_prompt` and `chat_history` will be deprecated in version v0.15.0, " - "and will be replaced by the parameter `messages`, " - "similar to the OpenAI API: https://platform.openai.com/docs/guides/chat-completions/getting-started", - category=DeprecationWarning, - stacklevel=2, - ) - url = f"{self._base_url}/v1/chat/completions" - if chat_history is None: - chat_history = [] - - chat_history = handle_system_prompts(chat_history, system_prompt) - chat_history.append({"role": "user", "content": prompt}) # type: ignore - request_body: Dict[str, Any] = { "model": self._model_uid, - "messages": chat_history, + "messages": messages, } if tools is not None: request_body["tools"] = tools diff --git a/xinference/client/tests/test_client.py b/xinference/client/tests/test_client.py index 095ef5e182..e6bd554129 100644 --- a/xinference/client/tests/test_client.py +++ b/xinference/client/tests/test_client.py @@ -73,12 +73,13 @@ def test_RESTful_client(setup): with pytest.raises(RuntimeError): completion = model.chat({"max_tokens": 64}) - completion = model.chat("What is the capital of France?") + messages = {"role": "user", "content": "What is the capital of France?"} + completion = model.chat(messages) assert "content" in completion["choices"][0]["message"] def _check_stream(): streaming_response = model.chat( - prompt="What is the capital of France?", + messages, generate_config={"stream": True, "max_tokens": 5}, ) for chunk in streaming_response: diff --git a/xinference/core/chat_interface.py b/xinference/core/chat_interface.py index 8738141f90..9de2dab252 100644 --- a/xinference/core/chat_interface.py +++ b/xinference/core/chat_interface.py @@ -16,7 +16,7 @@ import logging import os from io import BytesIO -from typing import Generator, List, Optional +from typing import Dict, Generator, List, Optional import gradio as gr import PIL.Image @@ -27,7 +27,6 @@ RESTfulChatModelHandle, RESTfulGenerateModelHandle, ) -from ..types import ChatCompletionMessage logger = logging.getLogger(__name__) @@ -96,11 +95,11 @@ def flatten(matrix: List[List[str]]) -> List[str]: flat_list += row return flat_list - def to_chat(lst: List[str]) -> List[ChatCompletionMessage]: + def to_chat(lst: List[str]) -> List[Dict]: res = [] for i in range(len(lst)): role = "assistant" if i % 2 == 1 else "user" - res.append(ChatCompletionMessage(role=role, content=lst[i])) + res.append(dict(role=role, content=lst[i])) return res def generate_wrapper( @@ -116,11 +115,12 @@ def generate_wrapper( client._set_token(self._access_token) model = client.get_model(self.model_uid) assert isinstance(model, RESTfulChatModelHandle) + messages = to_chat(flatten(history)) + messages.append(dict(role="user", content=message)) response_content = "" for chunk in model.chat( - prompt=message, - chat_history=to_chat(flatten(history)), + messages, generate_config={ "max_tokens": int(max_tokens), "temperature": temperature, @@ -191,15 +191,10 @@ def predict(history, bot, max_tokens, temperature, stream): model = client.get_model(self.model_uid) assert isinstance(model, RESTfulChatModelHandle) - prompt = history[-1] - assert prompt["role"] == "user" - prompt = prompt["content"] - # multimodal chat does not support stream. 
if stream: response_content = "" for chunk in model.chat( - prompt=prompt, - chat_history=history[:-1], + messages=history, generate_config={ "max_tokens": max_tokens, "temperature": temperature, @@ -224,8 +219,7 @@ def predict(history, bot, max_tokens, temperature, stream): yield history, bot else: response = model.chat( - prompt=prompt, - chat_history=history[:-1], + messages=history, generate_config={ "max_tokens": max_tokens, "temperature": temperature, diff --git a/xinference/core/model.py b/xinference/core/model.py index 4b08a4e9a8..10ab759fe6 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -439,9 +439,7 @@ async def _call_wrapper(self, output_type: str, fn: Callable, *args, **kwargs): @log_async(logger=logger) async def generate(self, prompt: str, *args, **kwargs): if self.allow_batching(): - return await self.handle_batching_request( - prompt, "generate", *args, **kwargs - ) + return await self.handle_batching_request(prompt, *args, **kwargs) else: kwargs.pop("raw_params", None) if hasattr(self._model, "generate"): @@ -481,22 +479,27 @@ async def _queue_consumer( yield res @staticmethod - def _get_stream_from_args(ability: str, *args) -> bool: - if ability == "chat": - assert args[2] is None or isinstance(args[2], dict) - return False if args[2] is None else args[2].get("stream", False) - else: - assert args[0] is None or isinstance(args[0], dict) - return False if args[0] is None else args[0].get("stream", False) + def _get_stream_from_args(*args) -> bool: + assert args[0] is None or isinstance(args[0], dict) + return False if args[0] is None else args[0].get("stream", False) - async def handle_batching_request(self, prompt: str, ability: str, *args, **kwargs): - stream = self._get_stream_from_args(ability, *args) + async def handle_batching_request( + self, prompt_or_messages: Union[str, List[Dict]], *args, **kwargs + ): + """ + The input parameter `prompt_or_messages`: + - when the model_ability is `generate`, it's `prompt`, which is str type. + - when the model_ability is `chat`, it's `messages`, which is List[Dict] type. 
+ """ + stream = self._get_stream_from_args(*args) assert self._scheduler_ref is not None if stream: assert self._scheduler_ref is not None queue: Queue[Any] = Queue() ret = self._queue_consumer(queue) - await self._scheduler_ref.add_request(prompt, queue, *args, **kwargs) + await self._scheduler_ref.add_request( + prompt_or_messages, queue, *args, **kwargs + ) gen = self._to_async_gen("json", ret) self._current_generator = weakref.ref(gen) return gen @@ -505,7 +508,9 @@ async def handle_batching_request(self, prompt: str, ability: str, *args, **kwar assert self._loop is not None future = ConcurrentFuture() - await self._scheduler_ref.add_request(prompt, future, *args, **kwargs) + await self._scheduler_ref.add_request( + prompt_or_messages, future, *args, **kwargs + ) fut = asyncio.wrap_future(future, loop=self._loop) result = await fut if result == XINFERENCE_NON_STREAMING_ABORT_FLAG: @@ -517,24 +522,22 @@ async def handle_batching_request(self, prompt: str, ability: str, *args, **kwar @request_limit @xo.generator @log_async(logger=logger) - async def chat(self, prompt: str, *args, **kwargs): + async def chat(self, messages: List[Dict], *args, **kwargs): start_time = time.time() response = None try: if self.allow_batching(): - return await self.handle_batching_request( - prompt, "chat", *args, **kwargs - ) + return await self.handle_batching_request(messages, *args, **kwargs) else: kwargs.pop("raw_params", None) if hasattr(self._model, "chat"): response = await self._call_wrapper_json( - self._model.chat, prompt, *args, **kwargs + self._model.chat, messages, *args, **kwargs ) return response if hasattr(self._model, "async_chat"): response = await self._call_wrapper_json( - self._model.async_chat, prompt, *args, **kwargs + self._model.async_chat, messages, *args, **kwargs ) return response raise AttributeError(f"Model {self._model.model_spec} is not for chat.") diff --git a/xinference/core/scheduler.py b/xinference/core/scheduler.py index 6b28f70259..842b8bd737 100644 --- a/xinference/core/scheduler.py +++ b/xinference/core/scheduler.py @@ -18,7 +18,7 @@ import uuid from collections import deque from enum import Enum -from typing import List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple, Union import xoscar as xo @@ -37,9 +37,11 @@ class AbortRequestMessage(Enum): class InferenceRequest: - def __init__(self, prompt, future_or_queue, is_prefill, *args, **kwargs): - # original prompt - self._prompt = prompt + def __init__( + self, prompt_or_messages, future_or_queue, is_prefill, *args, **kwargs + ): + # original prompt, prompt(str) for generate model and messages(List[Dict]) for chat model + self._prompt = prompt_or_messages # full prompt that contains chat history and applies chat template self._full_prompt = None # whether the current request is in the prefill phase @@ -88,29 +90,17 @@ def __init__(self, prompt, future_or_queue, is_prefill, *args, **kwargs): self._check_args() def _check_args(self): - # chat - if len(self._inference_args) == 3: - # system prompt - assert self._inference_args[0] is None or isinstance( - self._inference_args[0], str - ) - # chat history - assert self._inference_args[1] is None or isinstance( - self._inference_args[1], list - ) - # generate config - assert self._inference_args[2] is None or isinstance( - self._inference_args[2], dict - ) - else: # generate - assert len(self._inference_args) == 1 - # generate config - assert self._inference_args[0] is None or isinstance( - self._inference_args[0], dict - ) + assert 
len(self._inference_args) == 1 + # generate config + assert self._inference_args[0] is None or isinstance( + self._inference_args[0], dict + ) @property def prompt(self): + """ + prompt for generate model and messages for chat model + """ return self._prompt @property @@ -162,11 +152,7 @@ def append_new_token(self, token: int): @property def generate_config(self): - return ( - self._inference_args[2] - if len(self._inference_args) == 3 - else self._inference_args[0] - ) + return self._inference_args[0] @property def sanitized_generate_config(self): @@ -423,8 +409,16 @@ async def step(self): self._empty_cache() - async def add_request(self, prompt: str, future_or_queue, *args, **kwargs): - req = InferenceRequest(prompt, future_or_queue, True, *args, **kwargs) + async def add_request( + self, + prompt_or_messages: Union[str, List[Dict]], + future_or_queue, + *args, + **kwargs, + ): + req = InferenceRequest( + prompt_or_messages, future_or_queue, True, *args, **kwargs + ) rid = req.request_id if rid is not None: if rid in self._id_to_req: diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index a18d926c7b..61fc4caa8e 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -310,10 +310,7 @@ async def get_cluster_device_info(self, detailed: bool = False) -> List: async def get_builtin_prompts() -> Dict[str, Any]: from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE - data = {} - for k, v in BUILTIN_LLM_PROMPT_STYLE.items(): - data[k] = v.dict() - return data + return {k: v for k, v in BUILTIN_LLM_PROMPT_STYLE.items()} @staticmethod async def get_builtin_families() -> Dict[str, List[str]]: diff --git a/xinference/core/tests/test_continuous_batching.py b/xinference/core/tests/test_continuous_batching.py index f6db0362cf..c58b91bb55 100644 --- a/xinference/core/tests/test_continuous_batching.py +++ b/xinference/core/tests/test_continuous_batching.py @@ -48,7 +48,7 @@ def join(self, timeout=None): class InferenceThread(BaseThread): def __init__(self, prompt, generate_config, client, model): super().__init__() - self._prompt = prompt + self._prompt = [{"role": "user", "content": prompt}] self._generate_config = generate_config self._client = client self._model = model @@ -159,11 +159,12 @@ def test_continuous_batching(enable_batch, setup): thread2.join() # test error generate config + messages = [{"role": "user", "content": "你好"}] with pytest.raises(RuntimeError): - model.chat("你好", generate_config={"max_tokens": 99999999999999999}) + model.chat(messages, generate_config={"max_tokens": 99999999999999999}) with pytest.raises(RuntimeError): - model.chat("你好", generate_config={"stream_interval": 0}) + model.chat(messages, generate_config={"stream_interval": 0}) # test error with other correct requests thread1 = InferenceThread("1+1=3正确吗?", {"stream": True}, client, model) diff --git a/xinference/core/tests/test_metrics.py b/xinference/core/tests/test_metrics.py index 0004c5932f..4bcd2c3bbd 100644 --- a/xinference/core/tests/test_metrics.py +++ b/xinference/core/tests/test_metrics.py @@ -140,7 +140,8 @@ async def test_metrics_exporter_data(setup_cluster): ) model = client.get_model(model_uid) - response = model.chat("write a poem.") + messages = [{"role": "user", "content": "write a poem."}] + response = model.chat(messages) response = requests.get(metrics_exporter_address) assert response.ok diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py index cd47b98cc5..0c50eb256d 100644 --- 
a/xinference/core/tests/test_restful_api.py +++ b/xinference/core/tests/test_restful_api.py @@ -526,7 +526,8 @@ def test_restful_api_for_tool_calls(setup, model_format, quantization): client = RESTfulClient(endpoint) model = client.get_model(model_uid_res) - completion = model.chat("帮我查询股票10111的价格", tools=tools) + messages = [{"role": "user", "content": "帮我查询股票10111的价格"}] + completion = model.chat(messages, tools=tools) assert "content" in completion["choices"][0]["message"] assert "tool_calls" == completion["choices"][0]["finish_reason"] assert ( diff --git a/xinference/core/tests/test_types.py b/xinference/core/tests/test_types.py index 8dd3fdbd63..bfcd9d89dd 100644 --- a/xinference/core/tests/test_types.py +++ b/xinference/core/tests/test_types.py @@ -82,9 +82,6 @@ def test_create_chat_completion_types(): with pytest.raises(ValidationError): CreateChatCompletion(model="abc", not_exist="jdk") - # with pytest.raises(pydantic.ValidationError): - # CreateChatCompletion(model="abc", messages=[{"role": "invalid"}]) - CreateChatCompletion(model="abc", messages=[{"role": "tool"}], max_tokens=None) types = [CreateChatCompletionTorch, CreateChatCompletionLlamaCpp] diff --git a/xinference/deploy/cmdline.py b/xinference/deploy/cmdline.py index 8eea848077..f0f09720a5 100644 --- a/xinference/deploy/cmdline.py +++ b/xinference/deploy/cmdline.py @@ -17,7 +17,7 @@ import os import sys import warnings -from typing import List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union import click from xoscar.utils import get_next_port @@ -38,7 +38,6 @@ XINFERENCE_LOG_MAX_BYTES, ) from ..isolation import Isolation -from ..types import ChatCompletionMessage from .utils import ( get_config_dict, get_log_file, @@ -1210,13 +1209,12 @@ def model_chat( stream: bool, api_key: Optional[str], ): - # TODO: chat model roles may not be user and assistant. endpoint = get_endpoint(endpoint) client = RESTfulClient(base_url=endpoint, api_key=api_key) if api_key is None: client._set_token(get_stored_token(endpoint, client)) - chat_history: "List[ChatCompletionMessage]" = [] + messages: List[Dict] = [] if stream: # TODO: when stream=True, RestfulClient cannot generate words one by one. # So use Client in temporary. 
The implementation needs to be changed to @@ -1229,10 +1227,10 @@ async def chat_internal(): if prompt == "": break print("Assistant: ", end="", file=sys.stdout) + messages.append(dict(role="user", content=prompt)) response_content = "" for chunk in model.chat( - prompt=prompt, - chat_history=chat_history, + messages, generate_config={"stream": stream, "max_tokens": max_tokens}, ): delta = chunk["choices"][0]["delta"] @@ -1242,10 +1240,7 @@ async def chat_internal(): response_content += delta["content"] print(delta["content"], end="", flush=True, file=sys.stdout) print("", file=sys.stdout) - chat_history.append(ChatCompletionMessage(role="user", content=prompt)) - chat_history.append( - ChatCompletionMessage(role="assistant", content=response_content) - ) + messages.append(dict(role="assistant", content=response_content)) model = client.get_model(model_uid=model_uid) @@ -1274,20 +1269,17 @@ async def chat_internal(): prompt = input("User: ") if prompt == "": break - chat_history.append(ChatCompletionMessage(role="user", content=prompt)) + messages.append({"role": "user", "content": prompt}) print("Assistant: ", end="", file=sys.stdout) response = restful_model.chat( - prompt=prompt, - chat_history=chat_history, + messages, generate_config={"stream": stream, "max_tokens": max_tokens}, ) if not isinstance(response, dict): raise ValueError("chat result is not valid") response_content = response["choices"][0]["message"]["content"] print(f"{response_content}\n", file=sys.stdout) - chat_history.append( - ChatCompletionMessage(role="assistant", content=response_content) - ) + messages.append(dict(role="assistant", content=response_content)) @cli.command("vllm-models", help="Query and display models compatible with vLLM.") diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index fc63c0b27a..ae9f9c4d55 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -45,7 +45,6 @@ LLMFamilyV1, LLMSpecV1, MLXLLMSpecV1, - PromptStyleV1, PytorchLLMSpecV1, get_cache_status, get_user_defined_llm_families, @@ -141,7 +140,6 @@ def _install(): from .transformers.glm4v import Glm4VModel from .transformers.intern_vl import InternVLChatModel from .transformers.internlm2 import Internlm2PytorchChatModel - from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel from .transformers.minicpmv25 import MiniCPMV25Model from .transformers.minicpmv26 import MiniCPMV26Model from .transformers.qwen_vl import QwenVLChatModel @@ -170,8 +168,6 @@ def _install(): TRANSFORMERS_CLASSES.extend( [ ChatglmPytorchChatModel, - LlamaPytorchModel, - LlamaPytorchChatModel, PytorchChatModel, Internlm2PytorchChatModel, QwenVLChatModel, @@ -204,13 +200,17 @@ def _install(): model_spec = LLMFamilyV1.parse_obj(json_obj) BUILTIN_LLM_FAMILIES.append(model_spec) - # register prompt style + # register chat_template if "chat" in model_spec.model_ability and isinstance( - model_spec.prompt_style, PromptStyleV1 + model_spec.chat_template, str ): # note that the key is the model name, # since there are multiple representations of the same prompt style name in json. 
- BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style + BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = { + "chat_template": model_spec.chat_template, + "stop_token_ids": model_spec.stop_token_ids, + "stop": model_spec.stop, + } # register model family if "chat" in model_spec.model_ability: BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name) @@ -230,10 +230,14 @@ def _install(): # if duplicated with huggingface json, keep it as the huggingface style if ( "chat" in model_spec.model_ability - and isinstance(model_spec.prompt_style, PromptStyleV1) + and isinstance(model_spec.chat_template, str) and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE ): - BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style + BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = { + "chat_template": model_spec.chat_template, + "stop_token_ids": model_spec.stop_token_ids, + "stop": model_spec.stop, + } # register model family if "chat" in model_spec.model_ability: BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name) @@ -253,10 +257,14 @@ def _install(): # if duplicated with huggingface json, keep it as the huggingface style if ( "chat" in model_spec.model_ability - and isinstance(model_spec.prompt_style, PromptStyleV1) + and isinstance(model_spec.chat_template, str) and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE ): - BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style + BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = { + "chat_template": model_spec.chat_template, + "stop_token_ids": model_spec.stop_token_ids, + "stop": model_spec.stop, + } # register model family if "chat" in model_spec.model_ability: BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index b820fce466..30a835ff7c 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -14,12 +14,11 @@ import logging import os import time -from typing import Iterable, Iterator, List, Optional, Union +from typing import Dict, Iterator, List, Optional, Union from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChunk, CompletionUsage, @@ -181,10 +180,12 @@ def generator_wrapper( for index, _completion_chunk in enumerate( self._llm(prompt=_prompt, **_generate_config) ): + _completion_chunk["model"] = self.model_uid request_id = _completion_chunk["id"] choice = _completion_chunk["choices"][0] if choice["finish_reason"] is not None: completion_tokens = index + choice.pop("text", None) total_tokens = prompt_tokens + completion_tokens _completion_chunk["usage"] = CompletionUsage( prompt_tokens=total_tokens, @@ -262,39 +263,25 @@ def _sanitize_generate_config( self, generate_config: Optional[LlamaCppGenerateConfig] ) -> LlamaCppGenerateConfig: generate_config = super()._sanitize_generate_config(generate_config) - if self.model_family.prompt_style and self.model_family.prompt_style.stop: - generate_config["stop"] = self.model_family.prompt_style.stop + if self.model_family.stop and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() return generate_config def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[LlamaCppGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - assert 
self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - - chat_history = chat_history or [] - assert prompt_style is not None + model_family = self.model_family.model_family or self.model_family.model_name tools = generate_config.pop("tools", []) if generate_config else None - full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools) + full_context_kwargs = {} + if tools and model_family in QWEN_TOOL_CALL_FAMILY: + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, **full_context_kwargs + ) generate_config = self._sanitize_generate_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. - model_family = self.model_family.model_family or self.model_family.model_name - if tools and model_family in QWEN_TOOL_CALL_FAMILY: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = stop + ["Observation:"] # type: ignore - else: - generate_config["stop"] = "Observation:" stream = generate_config.get("stream", False) if stream: @@ -305,7 +292,5 @@ def chat( c = self.generate(full_prompt, generate_config) assert not isinstance(c, Iterator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 26f1d599a8..198123430b 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -46,24 +46,15 @@ "model_revision": "3cb06f589b7b1e2f8e728c77280b1114191d24de" } ], - "prompt_style": { - "style_name": "CodeShell", - "system_prompt": "", - "roles": [ - "## human:", - "## assistant: " - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 70000 - ], - "stop": [ - "<|endoftext|>", - "|||", - "||" - ] - } + "chat_template": "{% for item in messages %}{% if item['role'] == 'user' %}{{ '## human: ' + item['content'] + '||' }}{% elif item['role'] == 'assistant' %}{{ '## assistant: ' + item['content'] + '||' }}{% endif %}{% endfor %}{{ '## assistant: ' }}", + "stop_token_ids": [ + 70000 + ], + "stop": [ + "<|endoftext|>", + "|||", + "||" + ] }, { "version": 1, @@ -134,26 +125,17 @@ "model_revision": "ebee18c488086b396dde649f2aa6548b9b8d2404" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32001, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|assistant|>", - "<|end|>" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 + ], + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { 
"version": 1, @@ -189,156 +171,17 @@ "model_revision": "b86bcaf57ea4dfdec5dbe12a377028b2fab0d480" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32001, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|assistant|>", - "<|end|>" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "chatglm3", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat", - "tools" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "THUDM/chatglm3-6b", - "model_revision": "103caa40027ebfd8450289ca2f278eac4ff26405" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 32768, - "model_name": "chatglm3-32k", - "model_lang": [ - "en", - "zh" + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "THUDM/chatglm3-6b-32k", - "model_revision": "339f17ff464d47b5077527c2b34e80a7719ede3e" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 131072, - "model_name": "chatglm3-128k", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "THUDM/chatglm3-6b-128k", - "model_revision": "f0afbe671009abc9e31182170cf60636d5546cda" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { "version": 1, @@ -363,7 +206,7 @@ "none" ], "model_id": "THUDM/glm-4-9b-chat", - "model_revision": "aae8bd74af5c6dff63a49d7fbdcc89349ebf87aa" + "model_revision": "f6e0743b285dd808084530f070ad08e504386750" }, { "model_format": "ggufv2", @@ -392,24 +235,17 @@ "model_revision": 
"0155a14edf0176863e9a003cdd78ce599e4d62c0" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -463,24 +299,17 @@ "model_revision": "782e28bd5eee3c514c07108da15e0b5e06dcf776" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 
根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -505,27 +334,20 @@ "none" ], "model_id": "THUDM/glm-4v-9b", - "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4" + "model_revision": "01328faefe122fe605c1c127b62e6031d3ffebf7" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -567,24 +389,17 @@ "model_revision": "6a04071c54c943949826d4815ee00717ed8cf153" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|>\n' + item['content'] }}{% elif loop.first %}{{ '<|system|>\n你是一位智能编程助手,你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题,并提供格式规范、可以执行、准确安全的代码,并在必要时提供详细的解释。' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -622,14 +437,13 @@ "model_revision": "1e4944aaa1d8c8d0cdca28bb8e3a003303d0781b" } ], - "prompt_style": { - "style_name": "XVERSE", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|> \n' + item['content'] }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|> \n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|> \n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}", + "stop_token_ids": [ + 3 + ], + "stop": [ + "<|endoftext|>" + ] }, { "version": 1, @@ -842,22 +656,11 @@ "model_revision": "36d9a7388cc80e5f4b3e9701ca2f250d21a96c30" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nYou are a helpful AI assistant.\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% 
if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ], - "stop": [ - "" - ] - } + ], + "stop": [] }, { "version": 1, @@ -1210,24 +1013,15 @@ "model_id": "TechxGenus/Meta-Llama-3-70B-Instruct-GPTQ" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -1505,24 +1299,15 @@ "model_id": "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{{- '<|begin_of_text|>' }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -1558,8 +1343,7 @@ "zh" ], "model_ability": [ - "chat", - "tools" + "chat" ], "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.", "model_specs": [ @@ -1662,25 +1446,17 @@ "model_id": "Qwen/Qwen-72B-Chat-{quantization}" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - 
"stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|im_start|>system\n' + item['content'] + '<|im_end|>\n' }}{% elif loop.first %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|im_start|>user\n' + item['content'] + '<|im_end|>' }}{% elif item['role'] == 'assistant' %}{{ '<|im_start|>assistant\n' + item['content'] + '<|im_end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2025,25 +1801,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-2078,25 +1846,17 @@ "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n 
{{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2171,25 +1931,17 @@ "model_id": "Qwen/CodeQwen1.5-7B-Chat-AWQ" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2479,25 +2231,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- 
\"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' 
}}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2560,25 +2304,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-2623,19 +2359,8 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardMath-7B-V1.0", - "model_revision": "3c3a3b33334f4b35344b22c5c7465957ee7b2c75" - }, - { - "model_format": "pytorch", - "model_size_in_billions": 13, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "WizardLM/WizardMath-13B-V1.0", - "model_revision": "ef95532e96e634c634992dab891a17032dc71c8d" + "model_id": "WizardLMTeam/WizardMath-7B-V1.0", + "model_revision": "825a586f260d6c583b8aa9ceab6cdfaa3d9a4ddc" }, { "model_format": "pytorch", @@ -2645,19 +2370,17 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardMath-70B-V1.0", - "model_revision": "e089c3f9d2ad9d1acb62425aec3f4126f498f4c5" + "model_id": "WizardLMTeam/WizardMath-70B-V1.0", + "model_revision": "4dd9f3fcd8c056561d67ec59ae011f7c146aebd2" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE_COT", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### " - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2979,22 +2702,13 @@ "model_file_name_template": "codellama-34b-instruct.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nWrite code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ], - "stop": [ - "" - ] - } + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3032,20 +2746,12 @@ "model_revision": "a56c793eb7a721ab6c270f779024e0375e8afd4a" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 195 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}\n\n{% for message in messages %}\n{% if message['role'] == 'user' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% elif message['role'] == 'assistant' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n\n{% endif %}", + "stop_token_ids": [ + 2, + 195 + ], + "stop": [] }, { "version": 1, @@ -3189,22 +2895,13 @@ "model_file_name_template": "mistral-7b-instruct-v0.1.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3266,22 +2963,13 @@ "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - 
"intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3342,22 +3030,13 @@ "model_file_name_template": "Mistral-7B-Instruct-v0.3.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and 
message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3466,22 +3145,13 @@ "model_id": "mlx-community/Mistral-Nemo-Instruct-2407-8bit" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and 
system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3623,22 +3293,13 @@ "model_id": "mlx-community/Mistral-Large-Instruct-2407-8bit" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- 
\", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3745,22 +3406,13 @@ "model_file_name_template": "openhermes-2.5-mistral-7b.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "<|im_start|>system\nYou are 'Hermes 2.5', a conscious sentient superintelligent artificial intelligence, your purpose is to assist the user with their requests.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "<|im_end|>\n<|im_start|>", - "inter_message_sep": "", - "stop_token_ids": [ - 32000 - ], - "stop": [ - "<|im_end|>" - ] - } + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 32000 + ], + "stop": [ + "<|im_end|>" + ] }, { "version": 1, @@ -3909,16 +3561,13 @@ "model_file_name_template": "mixtral-8x7b-instruct-v0.1.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "MIXTRAL_V01", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "", - "inter_message_sep": "" - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if 
loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -4045,16 +3694,13 @@ } } ], - "prompt_style": { - "style_name": "MIXTRAL_V01", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "", - "inter_message_sep": "" - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = 
message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -4225,28 +3871,19 @@ "model_file_name_template": "yi-34b-chat.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -4494,28 +4131,19 @@ "model_revision": "3c12761a2c6663f216caab6dff84b0dd29b472ac" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -4593,28 +4221,19 @@ "model_file_name_template": "Yi-1.5-34B-Chat-16K-{quantization}.gguf" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": 
[ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -4627,17 +4246,6 @@ "chat" ], "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "WizardLM/WizardCoder-Python-7B-V1.0", - "model_revision": "e40673a27a4aefcff2c6d2b3b1e0681a38703e4e" - }, { "model_format": "pytorch", "model_size_in_billions": 13, @@ -4646,8 +4254,8 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardCoder-Python-13B-V1.0", - "model_revision": "d920d26e2108377de0f676a3c4be666f5212f4a1" + "model_id": "WizardLMTeam/WizardCoder-Python-13B-V1.0", + "model_revision": "5ac6748b1f5a4c282107ddc7d3b69fdc4a686d75" }, { "model_format": "pytorch", @@ -4657,8 +4265,8 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardCoder-Python-34B-V1.0", - "model_revision": "d869ce178715f8d6e8141e2ed50e6290985eedb0" + "model_id": "WizardLMTeam/WizardCoder-Python-34B-V1.0", + "model_revision": "897fc6d9e12136c68c441b2350d015902c144b20" }, { "model_format": "ggufv2", @@ -4721,157 +4329,13 @@ "model_file_name_template": "wizardcoder-python-34b-v1.0.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### ", - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-alpha", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "Zephyr-7B-α is the first model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "HuggingFaceH4/zephyr-7b-alpha", - "model_revision": "f28e1c0e5a1af475bcd7bdf6554e69abc6c0c7ee" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-beta", - "model_lang": [ - "en" + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' 
}}{% endif %}", + "stop_token_ids": [ + 2 ], - "model_ability": [ - "chat" - ], - "model_description": "Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "HuggingFaceH4/zephyr-7b-beta", - "model_revision": "3bac358730f8806e5c3dc7c7e19eb36e045bf720" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 4096, - "model_name": "gorilla-openfunctions-v1", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "OpenFunctions is designed to extend Large Language Model (LLM) Chat Completion feature to formulate executable APIs call given natural language instructions and API context.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "gorilla-llm/gorilla-openfunctions-v1", - "model_revision": "74615f614ee845eab114e71541fd5098d1709958" - }, - { - "model_format": "ggufv2", - "model_size_in_billions": 7, - "quantizations": [ - "Q2_K", - "Q3_K_L", - "Q3_K_M", - "Q3_K_S", - "Q4_0", - "Q4_K_M", - "Q4_K_S", - "Q5_0", - "Q5_K_M", - "Q5_K_S", - "Q6_K", - "Q8_0" - ], - "model_id": "TheBloke/gorilla-openfunctions-v1-GGUF", - "model_file_name_template": "gorilla-openfunctions-v1.{quantization}.gguf" - } - ], - "prompt_style": { - "style_name": "GORILLA_OPENFUNCTIONS", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [], - "stop": [] - } + "stop": [ + "" + ] }, { "version": 1, @@ -4913,18 +4377,15 @@ "model_file_name_template": "gorilla-openfunctions-v2.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "GORILLA_OPENFUNCTIONS", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [], - "stop": [] - } + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{'<|begin▁of▁sentence|>'}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\n' + message['content'] + '\n'}}\n {%- else %}\n{{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + "stop_token_ids": [ + 100015, + 100001 + ], + "stop": [ + "<|EOT|>", + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -4959,19 +4420,13 @@ "model_revision": "6f16f00805f45b5249f709ce21820122eeb43556" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -5126,19 +4581,13 @@ "model_file_name_template": "deepseek-llm-67b-chat.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -5523,18 +4972,13 @@ "model_revision": "c40b499bac2712cd3c445cf1b05d2c6558ab0d29" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CODER", - "system_prompt": "You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.", - "roles": [ - "### Instruction:", - "### Response:" - ], - "inter_message_sep": "\n", - "stop": [ - "<|EOT|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{'<|begin▁of▁sentence|>'}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\n' + message['content'] + '\n'}}\n {%- else %}\n{{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + "stop_token_ids": [ + 32021 + ], + "stop": [ + "<|EOT|>" + ] }, { "version": 1, @@ -5618,23 +5062,15 @@ "model_revision": "b666125047cd98c5a7c85ca28720b44a06aed124" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -5755,23 +5191,15 @@ "model_revision": "0ec94d61d30ab161b49c69f9bf92ec2b9986d234" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -5822,23 +5250,15 @@ "model_file_name_template": "internlm2_5-7b-chat-1m-{quantization}.gguf" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version":1, @@ -5873,14 +5293,13 @@ "model_revision":"ef62bae5af34be653b9801037cd613e05ab24fdc" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version":1, @@ -5915,14 +5334,13 @@ "model_revision":"f92aff28552de35de3be204e8fe292dd4824e544" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - 
"assistant" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + "stop_token_ids": [ + 128001 + ], + "stop": [ + "<|end_of_text|>" + ] }, { "version":1, @@ -5957,18 +5375,15 @@ "model_revision":"051e2df6505f1fc4305f2c9bd42ed90db8bf4874" } ], - "prompt_style":{ - "style_name":"QWEN", - "system_prompt":"You are a helpful assistant", - "roles":[ - "user", - "assistant" - ], - "stop": [ - "<|im_end|>", - "<|endoftext|>" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151645, + 151643 + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] }, { "version": 1, @@ -6003,24 +5418,17 @@ "model_revision": "5d3a5aa033ed2c502300d426c81cc5b13bcd1409" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -6055,18 +5463,17 @@ "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6093,18 +5500,17 @@ "model_revision": "eba2e20808407fb431a76b90d5d506e04a0325f2" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6139,28 +5545,19 @@ "model_revision": "ea29a9a430f27893e780366dae81d4ca5ebab561" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + 
message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -6195,17 +5592,17 @@ "model_id": "google/gemma-7b-it" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6385,17 +5782,17 @@ "model_id": "mlx-community/gemma-2-27b-it-fp16" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6539,23 +5936,15 @@ "model_revision": "0df19b6e10f1a19ca663f7cc1141aae10f1825f4" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, @@ -6626,23 +6015,15 @@ "model_revision": "a06fd164c7170714924d2881c61c8348425ebc94" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, 
@@ -6666,22 +6047,15 @@ "model_revision": "fe1d74027ebdd81cef5f815fa3a2d432a6b5de2a" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6705,22 +6079,15 @@ "model_revision": "35b90dd57d977b6e5bc4907986fa5b77aa15a82e" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6744,22 +6111,15 @@ "model_revision": "f4a3ba49f3f18695945c2a7c12400d4da99da498" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6783,22 +6143,15 @@ "model_revision": "e7a50289e4f839674cf8d4a5a2ce032ccacf64ac" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6822,22 +6175,15 @@ "model_revision": "b560a1593779b735a84a6daf72fba96ae38da288" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -7010,20 +6356,15 @@ "model_revision": "35febfc08f723ac0df32480eb4af349a7d08656e" } ], - "prompt_style": { - "style_name": "c4ai-command-r", - "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. 
You are trained by Cohere.",
-      "roles": [
-        "<|USER_TOKEN|>",
-        "<|CHATBOT_TOKEN|>"
-      ],
-      "intra_message_sep": "",
-      "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
-      "stop_token_ids": [
-        6,
-        255001
-      ]
-    }
+    "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
+    "stop_token_ids": [
+      6,
+      255001
+    ],
+    "stop": [
+      "",
+      "<|END_OF_TURN_TOKEN|>"
+    ]
   },
   {
     "version": 1,
@@ -7050,20 +6391,15 @@
         "model_revision": "1dddf3b95bc1391f6307299eb1c162c194bde9bd"
       }
     ],
-    "prompt_style": {
-      "style_name": "ADD_COLON_SINGLE",
-      "system_prompt": "",
-      "roles": [
-        "GPT4 Correct User",
-        "GPT4 Correct Assistant"
-      ],
-      "intra_message_sep": "<|end_of_turn|>",
-      "inter_message_sep": "",
-      "stop_token_ids": [
-        2,
-        32000
-      ]
-    }
+    "chat_template": "{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      32000
+    ],
+    "stop": [
+      "",
+      "<|end_of_turn|>"
+    ]
   },
   {
     "version": 1,
@@ -7113,25 +6449,17 @@
         "model_revision": "9db32d9127cac0c85961e169d75da57a18a847b1"
       }
     ],
-    "prompt_style": {
-      "style_name": "INTERNVL",
-      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
-      "roles": [
-        "<|im_start|>user",
-        "<|im_start|>assistant"
-      ],
-      "intra_message_sep": "<|im_end|>",
-      "stop_token_ids": [
-        2,
-        92543,
-        92542
-      ],
-      "stop": [
-        "",
-        "<|im_end|>",
-        "<|im_start|>"
-      ]
-    }
+    "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      92542,
+      92543
+    ],
+    "stop": [
+      "",
+      "<|im_end|>",
+      "<|im_start|>"
+    ]
   },
   {
     "version": 1,
@@ -7270,25 +6598,9 @@
        "model_revision": "1bc796bf80f2ebc7d6a14c15f55217a4600d50a4"
      }
    ],
-    "prompt_style": {
-      "style_name": "INTERNVL",
-      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
-      "roles": [
-        "<|im_start|>user",
-        "<|im_start|>assistant"
-      ],
-      "intra_message_sep": "<|im_end|>",
-      "stop_token_ids": [
-        2,
-        92543,
-        92542
-      ],
-      "stop": [
-        "",
-        "<|im_end|>",
-        "<|im_start|>"
- ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [], + "stop": [] }, { "version": 1, @@ -7323,24 +6635,15 @@ "model_revision": "7863e362174f4718c2fe9cba4befd0b580a3194f" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -7368,24 +6671,15 @@ "model_revision": "f375ead7d8202ebe2c3d09f1068abdddeb2929fa" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -7449,24 +6743,15 @@ "model_id": "Tele-AI/TeleChat-52B" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "<_user>", - "<_bot>" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop": [ - "<_end>", - "<_start>" - ], - "stop_token_ids": [ - 160133, - 160132 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{%- for message in messages -%}{%- if message['role'] == 'user' -%}{{- '<_user>' + message['content'] +'<_bot>' -}}{%- elif message['role'] == 'assistant' -%}{{- message['content'] + '<_end>' -}}{%- endif -%}{%- endfor -%}", + "stop": [ + "<_end>", + "<_start>" + ], + "stop_token_ids": [ + 160133, + 160132 + ] }, { "version": 1, @@ -7513,21 +6798,12 @@ "model_file_name_template": "csg-wukong-1B-chat-v0.1.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. When the user says \"continue,\" please proceed with the continuation of the previous assistant's response.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% elif loop.first %}{{ '<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. 
Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. When the user says \"continue,\" please proceed with the continuation of the previous assistant\\'s response.\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] } ] diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py index e615a10650..555921f18f 100644 --- a/xinference/model/llm/llm_family.py +++ b/xinference/model/llm/llm_family.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) DEFAULT_CONTEXT_LENGTH = 2048 -BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {} +BUILTIN_LLM_PROMPT_STYLE: Dict[str, Dict[str, Any]] = {} BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set() BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set() BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set() @@ -127,16 +127,6 @@ def validate_model_size_with_radix(cls, v: object) -> object: return v -class PromptStyleV1(BaseModel): - style_name: str - system_prompt: str = "" - roles: List[str] - intra_message_sep: str = "" - inter_message_sep: str = "" - stop: Optional[List[str]] - stop_token_ids: Optional[List[int]] - - class LLMFamilyV1(BaseModel): version: Literal[1] context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH @@ -147,12 +137,12 @@ class LLMFamilyV1(BaseModel): # reason for not required str here: legacy registration model_family: Optional[str] model_specs: List["LLMSpecV1"] - prompt_style: Optional["PromptStyleV1"] + chat_template: Optional[str] + stop_token_ids: Optional[List[int]] + stop: Optional[List[str]] class CustomLLMFamilyV1(LLMFamilyV1): - prompt_style: Optional[Union["PromptStyleV1", str]] # type: ignore - @classmethod def parse_raw( cls: Any, @@ -176,6 +166,11 @@ def parse_raw( except (ValueError, TypeError, UnicodeDecodeError) as e: raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls) llm_spec: CustomLLMFamilyV1 = cls.parse_obj(obj) + vision_model_names: Set[str] = { + family.model_name + for family in BUILTIN_LLM_FAMILIES + if "vision" in family.model_ability + } # check model_family if llm_spec.model_family is None: @@ -183,61 +178,45 @@ def parse_raw( f"You must specify `model_family` when registering custom LLM models." 
) assert isinstance(llm_spec.model_family, str) + # TODO: Currently, tool call and vision models cannot be registered if it is not the builtin model_family if ( - llm_spec.model_family != "other" - and "chat" in llm_spec.model_ability - and llm_spec.model_family not in BUILTIN_LLM_MODEL_CHAT_FAMILIES - ): - raise ValueError( - f"`model_family` for chat model must be `other` or one of the following values: \n" - f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}" - ) - if ( - llm_spec.model_family != "other" - and "tools" in llm_spec.model_ability + "tools" in llm_spec.model_ability and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES ): raise ValueError( - f"`model_family` for tool call model must be `other` or one of the following values: \n" + f"`model_family` for tool call model must be one of the following values: \n" f"{', '.join(list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES))}" ) if ( - llm_spec.model_family != "other" - and "chat" not in llm_spec.model_ability - and llm_spec.model_family not in BUILTIN_LLM_MODEL_GENERATE_FAMILIES + "vision" in llm_spec.model_ability + and llm_spec.model_family not in vision_model_names ): raise ValueError( - f"`model_family` for generate model must be `other` or one of the following values: \n" - f"{', '.join(list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES))}" + f"`model_family` for multimodal model must be one of the following values: \n" + f"{', '.join(list(vision_model_names))}" ) - # set prompt style when it is the builtin model family + # set chat_template when it is the builtin model family + if llm_spec.chat_template is None and "chat" in llm_spec.model_ability: + llm_spec.chat_template = llm_spec.model_family + + # handle chat_template when user choose existing model_family if ( - llm_spec.prompt_style is None - and llm_spec.model_family != "other" - and "chat" in llm_spec.model_ability + llm_spec.chat_template is not None + and llm_spec.chat_template in BUILTIN_LLM_PROMPT_STYLE ): - llm_spec.prompt_style = llm_spec.model_family - - # handle prompt style when user choose existing style - if llm_spec.prompt_style is not None and isinstance(llm_spec.prompt_style, str): - prompt_style_name = llm_spec.prompt_style - if prompt_style_name not in BUILTIN_LLM_PROMPT_STYLE: - raise ValueError( - f"Xinference does not support the prompt style name: {prompt_style_name}" - ) - llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name] + llm_spec.stop_token_ids = BUILTIN_LLM_PROMPT_STYLE[llm_spec.chat_template][ + "stop_token_ids" + ] + llm_spec.stop = BUILTIN_LLM_PROMPT_STYLE[llm_spec.chat_template]["stop"] + llm_spec.chat_template = BUILTIN_LLM_PROMPT_STYLE[llm_spec.chat_template][ + "chat_template" + ] # check model ability, registering LLM only provides generate and chat # but for vision models, we add back the abilities so that # gradio chat interface can be generated properly if ( - llm_spec.model_family != "other" - and llm_spec.model_family - in { - family.model_name - for family in BUILTIN_LLM_FAMILIES - if "vision" in family.model_ability - } + llm_spec.model_family in vision_model_names and "vision" not in llm_spec.model_ability ): llm_spec.model_ability.append("vision") diff --git a/xinference/model/llm/llm_family_csghub.json b/xinference/model/llm/llm_family_csghub.json index dc5b9d3ba8..d607b580b7 100644 --- a/xinference/model/llm/llm_family_csghub.json +++ b/xinference/model/llm/llm_family_csghub.json @@ -43,25 +43,17 @@ "model_hub": "csghub" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a 
helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n 
{%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -85,21 +77,12 @@ "model_hub": "csghub" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. When the user says \"continue,\" please proceed with the continuation of the previous assistant's response.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% elif loop.first %}{{ '<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. 
When the user says \"continue,\" please proceed with the continuation of the previous assistant\\'s response.\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 44ac3e7794..49e3bdabe3 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -70,19 +70,11 @@ "model_revision": "v1.0.1" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nYou are a helpful AI assistant.\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ] - } + ], + "stop": [] }, { "version": 1, @@ -175,24 +167,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -367,24 +350,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{{- '<|begin_of_text|>' }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -449,20 +423,12 @@ "model_revision": "v1.0.3" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 195 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}\n\n{% for message in messages %}\n{% if message['role'] == 'user' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% elif message['role'] == 'assistant' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n\n{% endif %}", + "stop_token_ids": [ + 2, + 195 + ], + "stop": [] }, { "version": 1, @@ -503,139 +469,6 @@ } ] }, - { - "version": 1, - "context_length": 8192, - "model_name": "chatglm3", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat", - "tools" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "ZhipuAI/chatglm3-6b", - "model_revision": "v1.0.2" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - 
"stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 32768, - "model_name": "chatglm3-32k", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "ZhipuAI/chatglm3-6b-32k", - "model_revision": "master" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 131072, - "model_name": "chatglm3-128k", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "ZhipuAI/chatglm3-6b-128k", - "model_revision": "master" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, { "version": 1, "context_length": 131072, @@ -690,24 +523,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 
根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -763,24 +589,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 
根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -809,24 +628,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -869,24 +681,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|>\n' + item['content'] }}{% elif loop.first %}{{ '<|system|>\n你是一位智能编程助手,你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题,并提供格式规范、可以执行、准确安全的代码,并在必要时提供详细的解释。' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -926,14 +731,13 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "XVERSE", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|> \n' + item['content'] }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|> \n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|> \n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}", + "stop_token_ids": [ + 3 + ], + "stop": [ + "<|endoftext|>" + ] }, { "version": 1, @@ -1045,23 +849,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -1086,23 +882,15 @@ 
"model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -1140,18 +928,13 @@ "model_revision": "v1.0.0" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### ", - "stop": [ - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -1252,24 +1035,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "CodeShell", - "system_prompt": "", - "roles": [ - "## human:", - "## assistant: " - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 70000 - ], - "stop": [ - "<|endoftext|>", - "|||", - "||" - ] - } + "chat_template": "{% for item in messages %}{% if item['role'] == 'user' %}{{ '## human: ' + item['content'] + '||' }}{% elif item['role'] == 'assistant' %}{{ '## assistant: ' + item['content'] + '||' }}{% endif %}{% endfor %}{{ '## assistant: ' }}", + "stop_token_ids": [ + 70000 + ], + "stop": [ + "<|endoftext|>", + "|||", + "||" + ] }, { "version": 1, @@ -1353,19 +1127,13 @@ "model_revision": "v0.1.0" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nWrite code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ] - } + ], + "stop": [ + "" + ] }, { "version": 1, @@ -1567,16 +1335,13 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MIXTRAL_V01", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "", - "inter_message_sep": "" - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -1716,28 +1481,19 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -1900,28 +1656,19 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": 
"{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -1961,28 +1708,19 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -2009,15 +1747,13 @@ "model_revision": "v1.0.0" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE_COT", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### " - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' 
}}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2044,22 +1780,13 @@ "model_revision": "v1.0.0" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2095,22 +1822,13 @@ "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2151,22 +1869,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for 
alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2208,106 +1917,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-alpha", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "Zephyr-7B-α is the first model in the series, and is a 
fine-tuned version of mistralai/Mistral-7B-v0.1.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "keepitsimple/zephyr-7b-alpha", - "model_revision": "v1.0-1" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-beta", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- 
else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 ], - "model_description": "Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "modelscope/zephyr-7b-beta", - "model_revision": "master" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "stop": [ + "" + ] }, { "version": 1, @@ -2318,8 +1934,7 @@ "zh" ], "model_ability": [ - "chat", - "tools" + "chat" ], "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.", "model_specs": [ @@ -2438,25 +2053,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|im_start|>system\n' + item['content'] + '<|im_end|>\n' }}{% elif loop.first %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|im_start|>user\n' + item['content'] + '<|im_end|>' }}{% elif item['role'] == 'assistant' %}{{ '<|im_start|>assistant\n' + item['content'] + '<|im_end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2832,25 +2439,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type 
is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor 
%}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2887,25 +2486,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-2984,25 +2575,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -3281,25 +2864,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-3365,25 +2940,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if 
messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -3418,19 +2985,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -3505,19 +3066,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -3614,18 +3169,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CODER", - "system_prompt": "You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.", - "roles": [ - "### Instruction:", - "### Response:" - ], - "inter_message_sep": "\n", - "stop": [ - "<|EOT|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{'<|begin▁of▁sentence|>'}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\n' + message['content'] + '\n'}}\n {%- else %}\n{{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + "stop_token_ids": [ + 32021 + ], + "stop": [ + "<|EOT|>" + ] }, { "version": 1, @@ -3713,23 +3263,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -3766,24 +3308,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -3819,18 +3354,17 @@ "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -3857,18 +3391,17 @@ "model_id": "OrionStarAI/Orion-14B-Chat-RAG" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: 
' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -3903,28 +3436,19 @@ "model_id": "01ai/Yi-VL-34B" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -3961,17 +3485,17 @@ "model_id": "AI-ModelScope/gemma-7b-it" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -4042,17 +3566,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version":1, @@ -4089,14 +3613,13 @@ "model_revision":"master" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -4121,22 +3644,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + 
"", + "" + ] }, { "version": 1, @@ -4161,22 +3677,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -4201,22 +3710,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -4241,22 +3743,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -4281,22 +3776,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version":1, @@ -4333,14 +3821,13 @@ "model_revision":"master" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + "stop_token_ids": [ + 128001 + ], + "stop": [ + "<|end_of_text|>" + ] }, { "version":1, @@ -4377,14 +3864,15 @@ "model_revision":"master" } ], - "prompt_style":{ - "style_name":"QWEN", - "system_prompt":"You are a helpful assistant", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151645, + 151643 + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] }, { "version": 1, @@ -4463,23 +3951,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - 
"intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, @@ -4504,23 +3984,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, @@ -4588,20 +4060,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "c4ai-command-r", - "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.", - "roles": [ - "<|USER_TOKEN|>", - "<|CHATBOT_TOKEN|>" - ], - "intra_message_sep": "", - "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>", - "stop_token_ids": [ - 6, - 255001 - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", + "stop_token_ids": [ + 6, + 255001 + ], + "stop": [ + "", + "<|END_OF_TURN_TOKEN|>" + ] }, { "version": 1, @@ -4628,24 +4095,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|end|>" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 + ], + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { "version": 1, @@ -4672,24 +4132,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|end|>" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 + ], + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { "version": 1, @@ -4718,25 +4171,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "INTERNVL", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92543, - 92542 - ], - "stop": [ - "", - "<|im_end|>", - "<|im_start|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542, + 92543 + ], + "stop": 
[
+      "</s>",
+      "<|im_end|>",
+      "<|im_start|>"
+    ]
   },
   {
     "version": 1,
@@ -4888,25 +4333,17 @@
         "model_revision": "master"
      }
     ],
-    "prompt_style": {
-      "style_name": "INTERNVL",
-      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
-      "roles": [
-        "<|im_start|>user",
-        "<|im_start|>assistant"
-      ],
-      "intra_message_sep": "<|im_end|>",
-      "stop_token_ids": [
-        2,
-        92543,
-        92542
-      ],
-      "stop": [
-        "</s>",
-        "<|im_end|>",
-        "<|im_start|>"
-      ]
-    }
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
   },
   {
     "version": 1,
@@ -4943,24 +4380,15 @@
         "model_revision": "master"
       }
     ],
-    "prompt_style": {
-      "style_name": "LLAMA3",
-      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
-      "roles": [
-        "user",
-        "assistant"
-      ],
-      "intra_message_sep": "\n\n",
-      "inter_message_sep": "<|eot_id|>",
-      "stop_token_ids": [
-        128001,
-        128009
-      ],
-      "stop": [
-        "<|end_of_text|>",
-        "<|eot_id|>"
-      ]
-    }
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}",
+    "stop_token_ids": [
+      128001,
+      128009
+    ],
+    "stop": [
+      "<|end_of_text|>",
+      "<|eot_id|>"
+    ]
   },
   {
     "version": 1,
@@ -4989,24 +4417,15 @@
         "model_revision": "master"
      }
     ],
-    "prompt_style": {
-      "style_name": "LLAMA3",
-      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -5080,23 +4499,14 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "<_user>", - "<_bot>" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop": [ - "<_end>", - "<_start>" - ], - "stop_token_ids": [ - 160133, - 160132 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{%- for message in messages -%}{%- if message['role'] == 'user' -%}{{- '<_user>' + message['content'] +'<_bot>' -}}{%- elif message['role'] == 'assistant' -%}{{- message['content'] + '<_end>' -}}{%- endif -%}{%- endfor -%}", + "stop": [ + "<_end>", + "<_start>" + ], + "stop_token_ids": [ + 160133, + 160132 + ] } ] diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 22fbd53e72..8df9207a95 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -12,25 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging -import time import uuid from typing import AsyncGenerator, Dict, Iterator, List, Optional, TypedDict, Union import torch -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionChunkChoice, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionUsage, - LoRA, -) +from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import ChatModelMixin +from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk logger = logging.getLogger(__name__) @@ -74,8 +64,8 @@ class LMDeployGenerateConfig(TypedDict, total=False): repetition_penalty: Optional[float] ignore_eos: Optional[bool] random_seed: Optional[int] - stop_words: Optional[List[str]] - bad_words: Optional[List[str]] + stop_words: Optional[List[int]] + bad_words: Optional[List[int]] min_new_tokens: Optional[int] skip_special_tokens: Optional[bool] logprobs: Optional[int] @@ -164,9 +154,6 @@ def load(self): raise ValueError(f"Can not find correct chat template.") chat_template_config = ChatTemplateConfig(chat_temp_name) - chat_template_config.meta_instruction = ( - self.model_family.prompt_style.system_prompt - ) count = torch.cuda.device_count() if count > 1: self._model_config.setdefault("tp", torch.cuda.device_count()) @@ -192,9 +179,7 @@ def match( async def async_chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: stream = ( @@ -213,75 +198,69 @@ async def async_chat( else False ) - chat_history = chat_history or [] - if stream: - chunk = self._chat_stream(prompt, chat_history, include_usage) + chunk = self._chat_stream(messages, include_usage) return self._async_to_chat_completion_chunks(chunk) else: - chunk = await self._chat(prompt, chat_history) - return self._to_chat_completion(chunk) + return await self._chat(messages) - async def _chat_stream(self, prompt, chat_history, include_usage): + async def _chat_stream(self, messages, include_usage): from lmdeploy.messages import Response prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 completion_id = str(uuid.uuid1()) + finish_reason = None async for output in self._generate( - prompt, - chat_history, + messages, session_id=-1, stream_response=True, ): new_text = output.text if isinstance(output, Response) else output.response - - completion_choice = ChatCompletionChunkChoice( - text=new_text, - index=0, - logprobs=None, - finish_reason=output.finish_reason, - ) - chunk = ChatCompletionChunk( - id=completion_id, - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) prompt_tokens = output.input_token_len completion_tokens = output.generate_token_len total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + finish_reason = output.finish_reason + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - chunk["usage"] = completion_usage - print(chunk) - yield chunk + + yield generate_completion_chunk( + chunk_text=None, + finish_reason=finish_reason, + chunk_id=completion_id, + model_uid=self.model_uid, + 
prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) if include_usage: - chunk = ChatCompletionChunk( - id=completion_id, - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk - async def _chat(self, prompt, chat_history): + async def _chat(self, messages) -> ChatCompletion: from lmdeploy.messages import Response - response, finish_reason = "", "" + response, finish_reason = "", None prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 async for output in self._generate( - prompt, - chat_history, + messages, session_id=-1, stream_response=False, ): @@ -291,30 +270,20 @@ async def _chat(self, prompt, chat_history): total_tokens = output.input_token_len + output.generate_token_len finish_reason = output.finish_reason - chunk = ChatCompletion( - id=str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason=finish_reason, logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=total_tokens, - ), + return generate_chat_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + finish_reason=finish_reason, ) - return chunk # copy from lmdeploy # Reference: lmdeploy.serve.async_engine.py async def _generate( self, - prompt, - chat_history, + messages: List[Dict], session_id: int, generate_config: Optional[Dict] = None, tools: Optional[List[object]] = None, @@ -332,6 +301,8 @@ async def _generate( from lmdeploy.serve.async_engine import GenOut from lmdeploy.tokenizer import DetokenizeState + from ..utils import get_stop_token_ids_from_config_file + session_id = -1 if str(session_id) not in self._model.id2step: @@ -343,7 +314,9 @@ async def _generate( generate_config, self._model.tokenizer ) if generate_config.stop_words is None: # type: ignore - generate_config.stop_words = self._model.stop_words # type: ignore + stop_token_ids = get_stop_token_ids_from_config_file(self.model_path) + if stop_token_ids is not None: + generate_config.stop_words = stop_token_ids # type: ignore if generate_config.random_seed is None and sequence_start: # type: ignore generate_config.random_seed = random.getrandbits(64) # type: ignore if generate_config.n > 1: # type: ignore @@ -353,7 +326,7 @@ async def _generate( ) generate_config.n = 1 # type: ignore - prompt_input = await self._get_prompt_input(prompt, chat_history) + prompt_input = await self._get_prompt_input(messages) prompt = prompt_input["prompt"] input_ids = prompt_input["input_ids"] finish_reason = None @@ -482,8 +455,7 @@ async def _generate( # Reference: lmdeploy.serve.vl_async_engine.py async def _get_prompt_input( self, - prompt: Union[str, List[Dict]], - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], sequence_start: bool = True, tools: Optional[List[object]] = None, **kwargs, @@ -493,13 +465,9 @@ async def _get_prompt_input( IMAGE_DUMMY_TOKEN_INDEX = 0 import numpy as np - assert 
self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - chat_history = chat_history or [] - - decorated, _ = self.get_prompt(prompt, chat_history, prompt_style) # type: ignore - chat_history.append(ChatCompletionMessage(role="user", content=prompt)) # type: ignore - prompt = chat_history # type: ignore + model_family = self.model_family.model_family or self.model_family.model_name + decorated, _ = self.get_specific_prompt(model_family, messages) # type: ignore + prompt = messages # type: ignore decorated = decorated.replace("", "") diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index e41db2b693..07966fcbba 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -17,13 +17,12 @@ import sys import time import uuid -from typing import Dict, Iterable, Iterator, List, Optional, TypedDict, Union +from typing import Dict, Iterator, List, Optional, TypedDict, Union from ....fields import max_tokens_field from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, @@ -32,7 +31,7 @@ ) from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import ChatModelMixin +from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin, generate_completion_chunk logger = logging.getLogger(__name__) @@ -54,6 +53,7 @@ class MLXGenerateConfig(TypedDict, total=False): stop_token_ids: Optional[Union[int, List[int]]] stream: bool stream_options: Optional[Union[dict, None]] + tools: Optional[List[Dict]] class MLXModel(LLM): @@ -238,29 +238,35 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): else: finish_reason = "stop" - if stream: - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason=finish_reason - ) - else: - completion_choice = CompletionChoice( - text=output, index=0, logprobs=None, finish_reason=finish_reason - ) - - completion_chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=model_uid, - choices=[completion_choice], - ) completion_usage = CompletionUsage( prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), ) - - yield completion_chunk, completion_usage + if stream: + yield generate_completion_chunk( + None, + finish_reason=finish_reason, + chunk_id=chunk_id, + model_uid=model_uid, + prompt_tokens=input_echo_len, + completion_tokens=i, + total_tokens=(input_echo_len + i), + has_choice=True, + has_content=False, + ), completion_usage + else: + yield generate_completion_chunk( + output, + finish_reason=finish_reason, + chunk_id=chunk_id, + model_uid=model_uid, + prompt_tokens=input_echo_len, + completion_tokens=i, + total_tokens=(input_echo_len + i), + has_choice=True, + has_content=True, + ), completion_usage if include_usage: completion_chunk = CompletionChunk( @@ -270,11 +276,6 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): model=model_uid, choices=[], ) - completion_usage = CompletionUsage( - prompt_tokens=input_echo_len, - completion_tokens=i, - total_tokens=(input_echo_len + i), - ) yield completion_chunk, completion_usage def generate( @@ -345,20 +346,13 @@ def _sanitize_generate_config( generate_config: Optional[MLXGenerateConfig], ) -> MLXGenerateConfig: generate_config = super()._sanitize_generate_config(generate_config) - if ( - (not generate_config.get("stop")) - and self.model_family.prompt_style - and 
self.model_family.prompt_style.stop - ): - generate_config["stop"] = self.model_family.prompt_style.stop.copy() + if (not generate_config.get("stop")) and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() if ( generate_config.get("stop_token_ids", None) is None - and self.model_family.prompt_style - and self.model_family.prompt_style.stop_token_ids + and self.model_family.stop_token_ids ): - generate_config[ - "stop_token_ids" - ] = self.model_family.prompt_style.stop_token_ids.copy() + generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy() return generate_config @@ -377,28 +371,19 @@ def match( def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[MLXGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - tools = generate_config.pop("tools", []) if generate_config else None # type: ignore - full_prompt = self.get_full_prompt( - self.model_family, prompt, system_prompt, chat_history, tools + model_family = self.model_family.model_family or self.model_family.model_name + tools = generate_config.pop("tools", []) if generate_config else None + full_context_kwargs = {} + if tools and model_family in QWEN_TOOL_CALL_FAMILY: + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, **full_context_kwargs ) generate_config = self._sanitize_generate_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. - model_family = self.model_family.model_family or self.model_family.model_name - if tools and model_family in ["qwen-chat", "qwen1.5-chat"]: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = list(stop) + ["Observation:"] - else: - generate_config["stop"] = "Observation:" stream = generate_config.get("stream", False) if stream: @@ -409,7 +394,5 @@ def chat( c = self.generate(full_prompt, generate_config) assert not isinstance(c, Iterator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) diff --git a/xinference/model/llm/mlx/tests/test_mlx.py b/xinference/model/llm/mlx/tests/test_mlx.py index 4fe69fd34f..b1d0682e5b 100644 --- a/xinference/model/llm/mlx/tests/test_mlx.py +++ b/xinference/model/llm/mlx/tests/test_mlx.py @@ -36,6 +36,7 @@ def test_load_mlx(setup): ) assert len(client.list_models()) == 1 model = client.get_model(model_uid) - completion = model.chat("write a poem.") + messages = [{"role": "user", "content": "write a poem."}] + completion = model.chat(messages) assert "content" in completion["choices"][0]["message"] assert len(completion["choices"][0]["message"]["content"]) != 0 diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 3c31b4fe7a..7d2566ee27 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -21,7 +21,6 @@ from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, @@ -29,7 +28,7 @@ ) from .. 
import LLM, LLMFamilyV1, LLMSpecV1 from ..llm_family import CustomLLMFamilyV1 -from ..utils import ChatModelMixin +from ..utils import ChatModelMixin, generate_completion_chunk logger = logging.getLogger(__name__) @@ -346,12 +345,14 @@ async def async_generate( async def stream_results() -> AsyncGenerator[CompletionChunk, None]: prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 + finish_reason = None async for meta_info, out in self._stream_generate( prompt, **sanitized_generate_config ): chunk = self._convert_state_to_completion_chunk( request_id, self.model_uid, output_text=out ) + finish_reason = meta_info["finish_reason"] prompt_tokens = meta_info["prompt_tokens"] completion_tokens = meta_info["completion_tokens"] total_tokens = prompt_tokens + completion_tokens @@ -361,6 +362,28 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: total_tokens=total_tokens, ) yield chunk + + finish_reason = ( + "stop" + if finish_reason is None + or ( + isinstance(finish_reason, str) + and finish_reason.lower() == "none" + ) + else finish_reason + ) + yield generate_completion_chunk( + None, + finish_reason=finish_reason, + chunk_id=request_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + if include_usage: chunk = CompletionChunk( id=request_id, @@ -409,26 +432,17 @@ def _sanitize_chat_config( ) -> Dict: if not generate_config: generate_config = {} - if self.model_family.prompt_style: - if ( - not generate_config.get("stop") - ) and self.model_family.prompt_style.stop: - generate_config["stop"] = self.model_family.prompt_style.stop.copy() + if self.model_family.stop: + if (not generate_config.get("stop")) and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() return generate_config async def async_chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] - full_prompt = self.get_prompt(prompt, chat_history, prompt_style) + full_prompt = self.get_full_context(messages, self.model_family.chat_template) generate_config = self._sanitize_chat_config(generate_config) stream = generate_config.get("stream", None) diff --git a/xinference/model/llm/tests/test_llm_family.py b/xinference/model/llm/tests/test_llm_family.py index 252491282c..146f00dd6f 100644 --- a/xinference/model/llm/tests/test_llm_family.py +++ b/xinference/model/llm/tests/test_llm_family.py @@ -26,7 +26,6 @@ CustomLLMFamilyV1, LlamaCppLLMSpecV1, LLMFamilyV1, - PromptStyleV1, PytorchLLMSpecV1, _generate_meta_file, _get_cache_dir, @@ -70,15 +69,9 @@ def test_deserialize_llm_family_v1(): "model_id":"example/TestModel" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", - "roles": ["user", "assistant"], - "intra_message_sep": "\\n### ", - "inter_message_sep": "\\n### ", - "stop": null, - "stop_token_ids": null - } + "chat_template": "xyz", + "stop_token_ids": [1, 2, 3], + "stop": ["hello", "world"] }""" model_family = LLMFamilyV1.parse_raw(serialized) assert isinstance(model_family, LLMFamilyV1) @@ -108,17 +101,9 @@ def test_deserialize_llm_family_v1(): assert pytorch_spec.model_hub == "huggingface" assert pytorch_spec.model_id == "example/TestModel" - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) - assert prompt_style == model_family.prompt_style + assert model_family.chat_template == "xyz" + assert model_family.stop_token_ids == [1, 2, 3] + assert model_family.stop == ["hello", "world"] def test_serialize_llm_family_v1(): @@ -139,16 +124,6 @@ def test_serialize_llm_family_v1(): model_id="example/TestModel", model_revision="456", ) - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) llm_family = LLMFamilyV1( version=1, model_type="LLM", @@ -156,10 +131,12 @@ def test_serialize_llm_family_v1(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[gguf_spec, pytorch_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) - expected = """{"version": 1, "context_length": 2048, "model_name": "TestModel", "model_lang": ["en"], "model_ability": ["embed", "generate"], "model_description": null, "model_family": null, "model_specs": [{"model_format": "ggufv2", "model_hub": "huggingface", "model_size_in_billions": 2, "quantizations": ["q4_0", "q4_1"], "quantization_parts": {"q4_2": ["a", "b"]}, "model_id": "example/TestModel", "model_revision": "123", "model_file_name_template": "TestModel.{quantization}.bin", "model_file_name_split_template": "TestModel.{quantization}.bin.{part}", "model_uri": null}, {"model_format": "pytorch", "model_hub": "huggingface", "model_size_in_billions": 3, "quantizations": ["int8", "int4", "none"], "model_id": "example/TestModel", "model_revision": "456", "model_uri": null}], "prompt_style": {"style_name": "ADD_COLON_SINGLE", "system_prompt": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", "roles": ["user", "assistant"], "intra_message_sep": "\\n### ", "inter_message_sep": "\\n### ", "stop": null, "stop_token_ids": null}}""" + expected = """{"version": 1, "context_length": 2048, "model_name": "TestModel", "model_lang": ["en"], "model_ability": ["embed", "generate"], "model_description": null, "model_family": null, "model_specs": [{"model_format": "ggufv2", "model_hub": "huggingface", "model_size_in_billions": 2, "quantizations": ["q4_0", "q4_1"], "quantization_parts": {"q4_2": ["a", "b"]}, "model_id": "example/TestModel", "model_revision": "123", "model_file_name_template": "TestModel.{quantization}.bin", "model_file_name_split_template": "TestModel.{quantization}.bin.{part}", "model_uri": null}, {"model_format": "pytorch", "model_hub": "huggingface", "model_size_in_billions": 3, "quantizations": ["int8", "int4", "none"], "model_id": "example/TestModel", "model_revision": "456", "model_uri": null}], "chat_template": "xyz", "stop_token_ids": [1, 2, 3], "stop": ["hello", "world"]}""" assert json.loads(llm_family.json()) == json.loads(expected) llm_family_context_length = LLMFamilyV1( @@ -170,7 +147,9 @@ def test_serialize_llm_family_v1(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[gguf_spec, pytorch_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) assert json.loads(llm_family_context_length.json()) == json.loads(expected) @@ -201,7 +180,9 @@ def test_cache_from_huggingface_pytorch(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = cache_from_huggingface(family, spec, quantization=None) @@ -230,7 +211,9 @@ def test_cache_from_huggingface_gguf(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = _get_cache_dir(family, spec) @@ -266,7 +249,9 @@ def test_cache_from_uri_local(): model_lang=["en"], model_ability=["embed", "chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = cache_from_uri(family, spec) @@ -295,7 +280,9 @@ def test_meta_file(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = cache_from_huggingface(family, spec, quantization=None) @@ -340,7 +327,9 @@ def test_legacy_cache(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_path = get_legacy_cache_path( @@ -378,7 +367,9 @@ def test_custom_llm(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) register_llm(family, False) @@ -408,7 +399,9 @@ def test_persistent_custom_llm(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) register_llm(family, True) @@ -501,16 +494,6 @@ def test_skip_download_pytorch(): model_hub="modelscope", model_revision="456", ) - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. 
The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) llm_family = LLMFamilyV1( version=1, model_type="LLM", @@ -518,7 +501,9 @@ def test_skip_download_pytorch(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[hf_spec, ms_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) cache_dir = _get_cache_dir(llm_family, hf_spec) @@ -594,16 +579,6 @@ def test_skip_download_gguf(): model_revision="123", model_file_name_template="TestModel.{quantization}.bin", ) - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) llm_family = LLMFamilyV1( version=1, model_type="LLM", @@ -611,7 +586,9 @@ def test_skip_download_gguf(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[hf_spec, ms_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) cache_dir = _get_cache_dir(llm_family, hf_spec) @@ -686,7 +663,9 @@ def test_get_cache_status_pytorch(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_status = get_cache_status(llm_family=family, llm_spec=spec) @@ -722,7 +701,9 @@ def test_get_cache_status_gguf(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_status = get_cache_status(llm_family=family, llm_spec=spec) @@ -741,13 +722,13 @@ def test_get_cache_status_gguf(): shutil.rmtree(cache_dir) -def test_parse_prompt_style(): +def test_parse_chat_template(): from ..llm_family import BUILTIN_LLM_PROMPT_STYLE assert len(BUILTIN_LLM_PROMPT_STYLE) > 0 # take some examples to assert assert "qwen-chat" in BUILTIN_LLM_PROMPT_STYLE - assert "chatglm3" in BUILTIN_LLM_PROMPT_STYLE + assert "glm4-chat" in BUILTIN_LLM_PROMPT_STYLE assert "baichuan-2-chat" in BUILTIN_LLM_PROMPT_STYLE hf_spec = LlamaCppLLMSpecV1( @@ -776,8 +757,8 @@ def test_parse_prompt_style(): model_lang=["en"], model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], - model_family="chatglm3", - prompt_style="chatglm3", + model_family="glm4-chat", + chat_template="glm4-chat", ) model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) assert model_spec.model_name == llm_family.model_name @@ -791,7 +772,7 @@ def test_parse_prompt_style(): model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], model_family="qwen-vl-chat", - prompt_style="qwen-vl-chat", + chat_template="qwen-vl-chat", ) model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf-8")) assert "vision" in model_spec.model_ability @@ -804,12 +785,12 @@ def test_parse_prompt_style(): model_lang=["en"], model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], - prompt_style="chatglm3", + chat_template="glm4-chat", ) with pytest.raises(ValueError): CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) - # wrong model_family + # successful new model family llm_family = CustomLLMFamilyV1( version=1, model_type="LLM", @@ -818,12 +799,20 
@@ def test_parse_prompt_style(): model_ability=["chat", "generate"], model_family="xyzz", model_specs=[hf_spec, ms_spec], - prompt_style="chatglm3", + chat_template="glm4-chat", ) - with pytest.raises(ValueError): - CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + assert ( + model_spec.chat_template + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["chat_template"] + ) + assert ( + model_spec.stop_token_ids + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop_token_ids"] + ) + assert model_spec.stop == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop"] - # error: wrong prompt style + # when chat_template is None, chat_template = model_family llm_family = CustomLLMFamilyV1( version=1, model_type="LLM", @@ -831,11 +820,19 @@ def test_parse_prompt_style(): model_lang=["en"], model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], - model_family="chatglm3", - prompt_style="test_xyz", + model_family="glm4-chat", + chat_template=None, ) - with pytest.raises(ValueError): - CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + assert ( + model_spec.chat_template + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["chat_template"] + ) + assert ( + model_spec.stop_token_ids + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop_token_ids"] + ) + assert model_spec.stop == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop"] def test_match_model_size(): @@ -1073,7 +1070,9 @@ def test_query_engine_general(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) register_llm(family, False) @@ -1107,15 +1106,9 @@ def test_query_engine_general(): model_lang=["en", "zh"], model_ability=["generate", "chat"], model_specs=[spec], - prompt_style={ - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": ["user", "assistant"], - "intra_message_sep": "\n", - "inter_message_sep": "", - "stop": ["<|endoftext|>", "<|im_start|>", "<|im_end|>"], - "stop_token_ids": [151643, 151644, 151645], - }, + chat_template="test", + stop=["<|endoftext|>", "<|im_start|>", "<|im_end|>"], + stop_token_ids=[151643, 151644, 151645], ) register_llm(family, False) diff --git a/xinference/model/llm/tests/test_multimodal.py b/xinference/model/llm/tests/test_multimodal.py index 567e0d0355..7bd3e78a15 100644 --- a/xinference/model/llm/tests/test_multimodal.py +++ b/xinference/model/llm/tests/test_multimodal.py @@ -34,16 +34,21 @@ def test_restful_api_for_qwen_vl(setup, model_format, quantization): quantization=quantization, ) model = client.get_model(model_uid) - prompt = [ - {"type": "text", "text": "What’s in this image?"}, + messages = [ { - "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - }, - }, + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } ] - response = model.chat(prompt=prompt) + response = model.chat(messages) assert "grass" in response["choices"][0]["message"]["content"] assert "tree" in response["choices"][0]["message"]["content"] 
assert "sky" in response["choices"][0]["message"]["content"] @@ -141,16 +146,21 @@ def test_restful_api_for_yi_vl(setup, model_format, quantization): quantization=quantization, ) model = client.get_model(model_uid) - prompt = [ - {"type": "text", "text": "What’s in this image?"}, + messages = [ { - "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - }, - }, + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } ] - response = model.chat(prompt=prompt) + response = model.chat(messages) assert "green" in response["choices"][0]["message"]["content"] assert "tree" in response["choices"][0]["message"]["content"] assert "sky" in response["choices"][0]["message"]["content"] @@ -225,16 +235,21 @@ def test_restful_api_for_deepseek_vl(setup, model_format, quantization): temperature=0.0, ) model = client.get_model(model_uid) - prompt = [ - {"type": "text", "text": "What’s in this image?"}, + messages = [ { - "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - }, - }, + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } ] - response = model.chat(prompt=prompt) + response = model.chat(messages) assert any( green in response["choices"][0]["message"]["content"] for green in ["grass", "green"] diff --git a/xinference/model/llm/tests/test_utils.py b/xinference/model/llm/tests/test_utils.py index 42125a0048..9d12d695ca 100644 --- a/xinference/model/llm/tests/test_utils.py +++ b/xinference/model/llm/tests/test_utils.py @@ -12,309 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ....types import ChatCompletionMessage -from ..llm_family import PromptStyleV1 -from ..utils import ChatModelMixin - - -def test_prompt_style_add_colon_single(): - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "A chat between a curious human and an artificial intelligence assistant. The assistant" - " gives helpful, detailed, and polite answers to the human's questions." - "\n### user: Hi there." - "\n### assistant: Hello, how may I help you?" - "\n### user: Write a poem." 
- "\n### assistant:" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_no_colon_two(): - prompt_style = PromptStyleV1( - style_name="NO_COLON_TWO", - system_prompt="", - roles=[" ", " "], - intra_message_sep="", - inter_message_sep="", - stop_token_ids=[2, 195], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - " Hi there." - " Hello, how may I help you?" - " Write a poem." - " " - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_llama2(): - prompt_style = PromptStyleV1( - style_name="LLAMA2", - system_prompt=( - "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information.\n<>\n\n" - ), - roles=["[INST]", "[/INST]"], - intra_message_sep=" ", - inter_message_sep=" ", - stop_token_ids=[2], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information.\n<>\n\nHi there.[/INST] Hello, how may I help" - " you? [INST] Write a poem. [/INST]" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_llama3(): - prompt_style = PromptStyleV1( - style_name="LLAMA3", - system_prompt=( - "You are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information" - ), - roles=["user", "assistant"], - intra_message_sep="\n\n", - inter_message_sep="<|eot_id|>", - stop_token_ids=[128001, 128009], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" 
- ), - ] - expected = ( - "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" - "You are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information<|eot_id|>" - "<|start_header_id|>user<|end_header_id|>\n\nHi there.<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\nHello, how may I help you?<|eot_id|>" - "<|start_header_id|>user<|end_header_id|>\n\nWrite a poem.<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_chatglm_v3(): - prompt_style = PromptStyleV1( - style_name="CHATGLM3", - system_prompt="", - roles=["user", "assistant"], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|user|>\n Hi there.\n" - "<|assistant|>\n Hello, how may I help you?\n" - "<|user|>\n Write a poem.\n" - "<|assistant|>" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_xverse(): - prompt_style = PromptStyleV1( - style_name="XVERSE", - system_prompt="", - roles=["user", "assistant"], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|user|> \n Hi there." - "<|assistant|> \n Hello, how may I help you?" - "<|user|> \n Write a poem." - "<|assistant|>" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_qwen(): - prompt_style = PromptStyleV1( - style_name="QWEN", - system_prompt="You are a helpful assistant.", - roles=["user", "assistant"], - intra_message_sep="\n", - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHi there." - "<|im_end|>\n<|im_start|>assistant\nHello, how may I help you?<|im_end|>\n<|im_start|>" - "user\nWrite a poem.<|im_end|>\n<|im_start|>assistant\n" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_chatml(): - prompt_style = PromptStyleV1( - style_name="CHATML", - system_prompt="You are a wonderful code assistant\n", - roles=["<|user|>", "<|assistant|>"], - intra_message_sep="<|end|>", - ) - - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" 
- ), - ] - - expected = ( - "You are a wonderful code assistant\n" - "<|end|>\n" - "<|user|>\n" - "Hi there.<|end|>\n" - "<|assistant|>\n" - "Hello, how may I help you?<|end|>\n" - "<|user|>\n" - "Write me a HelloWorld Function<|end|>\n" - "<|assistant|>\n" - ) - assert expected == ChatModelMixin.get_prompt( - "Write me a HelloWorld Function", chat_history, prompt_style - ) - - -def test_prompt_style_add_colon_single_cot(): - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE_COT", - system_prompt=( - "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request." - ), - roles=["Instruction", "Response"], - intra_message_sep="\n\n### ", - ) - - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request." - "\n\n### Instruction: Hi there." - "\n\n### Response: Hello, how may I help you?" - "\n\n### Instruction: Write a poem." - "\n\n### Response: Let's think step by step." - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_zephyr(): - prompt_style = PromptStyleV1( - style_name="NO_COLON_TWO", - system_prompt=( - "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate.\n" - ), - roles=["<|user|>\n", "<|assistant|>\n"], - intra_message_sep="\n", - inter_message_sep="\n", - stop_token_ids=[2, 195], - stop=[""], - ) - - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|system|>\n" - "You are a friendly chatbot who always responds in the style of a pirate.\n" - "<|user|>\n" - "Hi there.\n" - "<|assistant|>\n" - "Hello, how may I help you?\n" - "<|user|>\n" - "Write a poem.\n" - "<|assistant|>\n" - ) - actual = ChatModelMixin.get_prompt("Write a poem.", chat_history, prompt_style) - assert expected == actual - def test_is_valid_model_name(): from ...utils import is_valid_model_name diff --git a/xinference/model/llm/transformers/chatglm.py b/xinference/model/llm/transformers/chatglm.py index 797402b220..723664cbcf 100644 --- a/xinference/model/llm/transformers/chatglm.py +++ b/xinference/model/llm/transformers/chatglm.py @@ -11,45 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
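The hand-rolled PromptStyleV1 formatting exercised by the deleted tests above is superseded by per-family chat templates, which the reworked models render directly from the OpenAI-style message list. A minimal sketch of that flow, assuming a Hugging Face tokenizer that ships a chat template (the model id is only an example of a template-bearing tokenizer):

# Illustrative sketch only: render OpenAI-style messages with a chat template.
from transformers import AutoTokenizer

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a poem."},
]

# glm-4-9b-chat is used here purely as an example; any tokenizer with a template works.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    messages,
    chat_template=tokenizer.chat_template,  # an explicit template string can be passed here too
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)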
-import copy import json -import threading -import time +import typing import uuid +from threading import Thread from typing import Any, Dict, Iterator, List, Optional, Union import torch -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList from ....core.scheduler import InferenceRequest -from ....types import ( - SPECIAL_TOOL_PROMPT, - ChatCompletion, - ChatCompletionChoice, - ChatCompletionChunk, - ChatCompletionMessage, - CompletionChoice, - CompletionChunk, - CompletionUsage, - LoRA, - PytorchGenerateConfig, -) +from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import GLM4_TOOL_CALL_FAMILY +from ..utils import ( + GLM4_TOOL_CALL_FAMILY, + generate_chat_completion, + generate_completion_chunk, +) from .core import PytorchChatModel, PytorchModelConfig -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor - ) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 198] = 5e4 - return scores - - class ChatglmPytorchChatModel(PytorchChatModel): def __init__( self, @@ -107,40 +87,28 @@ def match( if llm_spec.model_format != "pytorch": return False model_family = llm_family.model_family or llm_family.model_name - if "chatglm" not in model_family and "glm4" not in model_family: + if "glm4" not in model_family: return False if "chat" not in llm_family.model_ability: return False return True - def _handle_tools(self, chat_history, generate_config) -> bool: + def _handle_tools(self, messages, generate_config): """Convert openai tools to ChatGLM tools.""" + if self.model_family.model_name not in GLM4_TOOL_CALL_FAMILY: + return None if generate_config is None: - return False + return None tools = generate_config.pop("tools", None) if tools is None: - return False - # Convert a iterable to a list + return None + # Convert an iterable to a list tools = list(tools) tool_choice = generate_config.pop("tool_choice", "none") - if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY: - chat_history[:] = self._process_messages( - chat_history, tools=tools, tool_choice=tool_choice - ) - return True - else: - chatglm_tools = [] - for elem in tools: - if elem.get("type") != "function" or "function" not in elem: - raise ValueError("ChatGLM tools only support function type.") - chatglm_tools.append(elem["function"]) - tool_prompt_message = { - "role": "system", - "content": f"Answer the following questions as best as you can. 
You have access to the following tools:", - "tools": chatglm_tools, - } - chat_history.insert(0, tool_prompt_message) - return True + messages[:] = self._process_messages( + messages, tools=tools, tool_choice=tool_choice + ) + return tools @staticmethod def _process_messages(messages, tools=None, tool_choice="none"): @@ -230,12 +198,70 @@ def _filter_tools(_tool_choice, _tools): return processed_messages @staticmethod - def _process_response(output, history, tools, end=False): + @typing.no_type_check + def _process_response_non_streaming( + output: str, tools: Union[Dict, List[Dict]] = None, use_tool: bool = False + ) -> Union[str, dict]: + """ + Copied from https://github.com/THUDM/GLM-4/blob/main/basic_demo/openai_api_server.py#L150 + """ + import re + + lines = output.strip().split("\n") + arguments_json = None + special_tools = ["cogview", "simple_browser"] + tools = {tool["function"]["name"] for tool in tools} if tools else {} + + # 这是一个简单的工具比较函数,不能保证拦截所有非工具输出的结果,比如参数未对齐等特殊情况。 + ##TODO 如果你希望做更多判断,可以在这里进行逻辑完善。 + + if len(lines) >= 2 and lines[1].startswith("{"): + function_name = lines[0].strip() + arguments = "\n".join(lines[1:]).strip() + if function_name in tools or function_name in special_tools: + try: + arguments_json = json.loads(arguments) + is_tool_call = True + except json.JSONDecodeError: + is_tool_call = function_name in special_tools + + if is_tool_call and use_tool: + content = { + "name": function_name, + "arguments": json.dumps( + arguments_json + if isinstance(arguments_json, dict) + else arguments, + ensure_ascii=False, + ), + } + if function_name == "simple_browser": + search_pattern = re.compile( + r'search\("(.+?)"\s*,\s*recency_days\s*=\s*(\d+)\)' + ) + match = search_pattern.match(arguments) + if match: + content["arguments"] = json.dumps( + { + "query": match.group(1), + "recency_days": int(match.group(2)), + }, + ensure_ascii=False, + ) + elif function_name == "cogview": + content["arguments"] = json.dumps( + {"prompt": arguments}, ensure_ascii=False + ) + + return content + return output.strip() + + @staticmethod + def _process_response_streaming(output, tools, end=False): # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py content = "" - history = copy.deepcopy(history) if not tools and end: - return None, None + return None for response in output.split("<|assistant|>"): if "\n" in response: metadata, content = response.split("\n", maxsplit=1) @@ -244,204 +270,53 @@ def _process_response(output, history, tools, end=False): if not metadata.strip(): if tools and any(t.startswith(response) for t in tools) and not end: # Waiting for tool call complete. 
- return None, None + return None content = content.strip() - history.append( - {"role": "assistant", "metadata": metadata, "content": content} - ) content = content.replace("[[训练时间]]", "2023年") else: if tools and metadata in tools and not end: - return None, None - history.append( - {"role": "assistant", "metadata": metadata, "content": content} - ) + return None metadata = metadata.strip() if tools and metadata in tools and end: try: parameters = json.loads(content) - content = {"name": metadata.strip(), "parameters": parameters} + content = {"name": metadata.strip(), "arguments": parameters} except json.JSONDecodeError: content = {"name": metadata.strip(), "content": content} else: content = {"name": metadata.strip(), "content": content} - return content, history - - def _get_generate_args( - self, - tokenizer, - query: str, - history: Optional[List[Dict]] = None, - role: str = "user", - past_key_values=None, - max_length: int = 8192, - do_sample=True, - top_p=0.8, - temperature=0.8, - logits_processor=None, - **kwargs, - ): - # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - eos_token_id = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids("<|user|>"), - tokenizer.convert_tokens_to_ids("<|observation|>"), - ] - gen_kwargs = { - "max_length": max_length, - "do_sample": do_sample, - "top_p": top_p, - "temperature": temperature, - "logits_processor": logits_processor, - **kwargs, - } - if past_key_values is None: - inputs = tokenizer.apply_chat_template( - history + [{"role": role, "content": query}], - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True, - ) - else: - inputs = tokenizer.apply_chat_template( - [{"role": role, "content": query}], - add_special_tokens=False, - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True, - ) - inputs = inputs.to(self._model.device) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - attention_mask = torch.cat( - (attention_mask.new_ones(1, past_length), attention_mask), dim=1 - ) - inputs["attention_mask"] = attention_mask - history.append({"role": role, "content": query}) - tools = history[0]["role"] == "system" and history[0].get("tools") - tools = ( - [ - t.get("function", {}).get("name", "") - for t in tools - if isinstance(t, dict) - ] - if tools - else [] - ) - kwargs = dict(inputs) - kwargs["past_key_values"] = past_key_values - kwargs["eos_token_id"] = eos_token_id - kwargs.update(gen_kwargs) - return kwargs, tools + return content @torch.inference_mode() - def _stream_chat( - self, - tokenizer, - query: str, - history: Optional[List[Dict]] = None, - role: str = "user", - past_key_values=None, - max_length: int = 8192, - do_sample=True, - top_p=0.8, - temperature=0.8, - logits_processor=None, - **kwargs, - ): + def _stream_chat(self, inputs, tools, **kwargs): from transformers import TextIteratorStreamer - kwargs, tools = self._get_generate_args( - tokenizer=tokenizer, - query=query, - history=history, - role=role, - past_key_values=past_key_values, - max_length=max_length, - do_sample=do_sample, - top_p=top_p, - temperature=temperature, - logits_processor=logits_processor, - **kwargs, - ) - streamer = TextIteratorStreamer( - tokenizer, skip_prompt=True, 
skip_special_tokens=True + self._tokenizer, skip_prompt=True, skip_special_tokens=True ) - kwargs["streamer"] = streamer - thread = threading.Thread(target=self._model.generate, kwargs=kwargs) + tools = {tool["function"]["name"] for tool in tools} if tools else {} + generation_kwargs = dict(inputs, streamer=streamer) + generation_kwargs.update(kwargs) + thread = Thread(target=self._model.generate, kwargs=generation_kwargs) thread.start() response = "" for token in streamer: response += token if response and response[-1] != "�": - new_response, new_history = self._process_response( - response, history, tools, end=False + new_response = self._process_response_streaming( + response, tools, end=False ) if new_response is None: continue - yield new_response, new_history + yield new_response if tools: - new_response, new_history = self._process_response( - response, history, tools, end=True - ) + new_response = self._process_response_streaming(response, tools, end=True) if new_response: - yield new_response, new_history - - @torch.inference_mode() - def _non_stream_chat( - self, - tokenizer, - query: str, - history: Optional[List[Dict]] = None, - role: str = "user", - past_key_values=None, - max_length: int = 8192, - do_sample=True, - top_p=0.8, - temperature=0.8, - logits_processor=None, - **kwargs, - ): - kwargs, tools = self._get_generate_args( - tokenizer=tokenizer, - query=query, - history=history, - role=role, - past_key_values=past_key_values, - max_length=max_length, - do_sample=do_sample, - top_p=top_p, - temperature=temperature, - logits_processor=logits_processor, - **kwargs, - ) - - outputs = self._model.generate(**kwargs) - outputs = outputs[:, kwargs["input_ids"].shape[1] :] - response = tokenizer.decode(outputs[0], skip_special_tokens=True) - if tools: - return self._process_response(response, history, tools, end=True) - else: - return self._process_response(response, history, tools) + yield new_response - def chat( - self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, - generate_config: Optional[PytorchGenerateConfig] = None, - ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + @staticmethod + def _get_generate_kwargs(generate_config): kwargs: Dict[str, Any] = {} generate_config = generate_config or {} temperature = generate_config.get("temperature") @@ -453,18 +328,26 @@ def chat( max_new_tokens = generate_config.get("max_tokens") if max_new_tokens is not None: kwargs["max_new_tokens"] = int(max_new_tokens) - chat_history = chat_history or [] - tools = self._handle_tools(chat_history, generate_config) - # Tool calls only works for non stream, so we call chat directly. 
- if prompt == SPECIAL_TOOL_PROMPT and chat_history: - tool_message = chat_history.pop() - content = tool_message.get("content") - assert content is not None - prompt = content - kwargs["role"] = "observation" - chat_history = [h for h in chat_history if not h.get("tool_calls")] - if system_prompt: - chat_history.append({"role": "system", "content": system_prompt}) + do_sample = generate_config.get("do_sample") + if do_sample is not None: + kwargs["do_sample"] = bool(do_sample) + top_k = generate_config.get("top_k") + if top_k is not None: + kwargs["top_k"] = top_k + repetition_penalty = generate_config.get("repetition_penalty") + if repetition_penalty is not None: + kwargs["repetition_penalty"] = repetition_penalty + return kwargs + + def chat( + self, + messages: List[Dict], + generate_config: Optional[PytorchGenerateConfig] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + generate_config = generate_config or {} + kwargs: Dict[str, Any] = self._get_generate_kwargs(generate_config) + tools = self._handle_tools(messages, generate_config) + has_tools = tools is not None stream = generate_config.get("stream", False) stream_options = generate_config.pop("stream_options", None) include_usage = ( @@ -472,103 +355,82 @@ def chat( if isinstance(stream_options, dict) else False ) - if stream and ( - not tools or self.model_family.model_name in GLM4_TOOL_CALL_FAMILY - ): + inputs = self._tokenizer.apply_chat_template( + messages, + return_tensors="pt", + chat_template=self.model_family.chat_template, + add_generation_prompt=True, + return_dict=True, + ) + inputs = inputs.to(self._model.device) + + if not stream: + with torch.no_grad(): + outputs = self._model.generate(**inputs, **kwargs) + outputs = outputs[:, inputs["input_ids"].shape[1] :] + response = self._tokenizer.decode(outputs[0], skip_special_tokens=True) + # In some cases, the response starts with `\n` + if response.startswith("\n"): + response = response[1:] + if has_tools: + function_call = self._process_response_non_streaming( + response, tools, use_tool=True + ) + return self._tool_calls_completion( + self.model_family, self.model_uid, function_call + ) + else: + return generate_chat_completion(self.model_uid, response) + else: def _stream_generator(): last_chunk_text_length = 0 chunk_id = "chat-" + str(uuid.uuid1()) prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 - inputs = self._tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(self._model.device) prompt_tokens = len(inputs["input_ids"][0]) - for chunk_text, _ in self._stream_chat( - self._tokenizer, prompt, chat_history, **kwargs - ): + for chunk_text in self._stream_chat(inputs, tools, **kwargs): if tools and isinstance(chunk_text, dict): yield self._tool_calls_completion_chunk( - self.model_family, self.model_uid, [chunk_text, _], tools + self.model_family, self.model_uid, chunk_text ) return completion_tokens = completion_tokens + 1 total_tokens = prompt_tokens + completion_tokens chunk_text = chunk_text[last_chunk_text_length:] last_chunk_text_length += len(chunk_text) - completion_choice = CompletionChoice( - text=chunk_text, index=0, logprobs=None, finish_reason=None - ) - yield CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=total_tokens, - ), + yield generate_completion_chunk( + chunk_text, + finish_reason=None, + 
chunk_id=chunk_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, ) - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + None, + finish_reason="stop", + chunk_id=chunk_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk if include_usage: - chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + None, + finish_reason=None, + chunk_id=chunk_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, ) - yield chunk return self._to_chat_completion_chunks(_stream_generator()) - else: - response = self._non_stream_chat( - self._tokenizer, prompt, chat_history, **kwargs - ) - if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, response, tools - ) - else: - content, _ = response - return ChatCompletion( - id="chat" + str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - ChatCompletionChoice( - index=0, - message={"role": "assistant", "content": content}, - finish_reason="stop", - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) def prepare_sanitize_generate_config(self, req: InferenceRequest): """ diff --git a/xinference/model/llm/transformers/cogvlm2.py b/xinference/model/llm/transformers/cogvlm2.py index 79b15be69c..f3c27454d9 100644 --- a/xinference/model/llm/transformers/cogvlm2.py +++ b/xinference/model/llm/transformers/cogvlm2.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
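With the prompt / system_prompt / chat_history triple gone, callers hand the whole OpenAI-style message list to chat(), and GLM4 tool definitions are taken from the request instead of being spliced into a synthetic system message. A hedged usage sketch; the weather tool and the already-obtained model handle are assumptions made only for illustration:

# Illustrative sketch: messages-based chat with an OpenAI-style tool definition.
# `model` is assumed to be a chat model handle obtained elsewhere via Client.get_model(...).
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the weather like in Beijing today?"},
]
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool used only for this example
            "description": "Query the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]
completion = model.chat(messages, tools=tools, generate_config={"stream": False})
message = completion["choices"][0]["message"]
# When the model decides to call the tool, the assistant message carries
# `tool_calls` with the function name and JSON-encoded arguments.
print(message.get("tool_calls") or message.get("content"))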
import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -21,17 +20,14 @@ from ....core.scheduler import InferenceRequest from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig from .utils import get_max_src_len @@ -139,9 +135,7 @@ def _message_content_to_cogvlm2(self, content): ) return content, None - def _history_content_to_cogvlm2( - self, system_prompt: str, chat_history: List[ChatCompletionMessage] - ): + def _history_content_to_cogvlm2(self, system_prompt: str, chat_history: List[Dict]): query = system_prompt history: List[Tuple] = [] pixel_values = None @@ -163,7 +157,7 @@ def get_query_and_history( self, prompt: Union[str, List[Dict]], system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + chat_history: Optional[List[Dict]] = None, ): content, image = self._message_content_to_cogvlm2(prompt) @@ -184,12 +178,12 @@ def get_query_and_history( def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - system_prompt = system_prompt if system_prompt else "" + system_prompt = "" + if messages[0]["role"] == "system": + system_prompt = messages[0]["content"] stream = generate_config.get("stream", False) if generate_config else False sanitized_config = { @@ -199,6 +193,7 @@ def chat( else 512, } + prompt, _, chat_history = parse_messages(messages) query, image, history = self.get_query_and_history( prompt, system_prompt=system_prompt, chat_history=chat_history ) @@ -236,21 +231,7 @@ def chat( response = self._tokenizer.decode(outputs[0]) response = response.split("<|end_of_text|>")[0] - chunk = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(chunk) + return generate_chat_completion(self.model_uid, response) def _streaming_chat_response( self, inputs: Dict, config: Dict @@ -277,36 +258,26 @@ def _streaming_chat_response( completion_id = str(uuid.uuid1()) for new_text in streamer: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=new_text, finish_reason=None, logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, ) - yield chunk - - completion_choice = CompletionChoice( - 
text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=False, ) - yield chunk @staticmethod def build_position_ids(x, attention_mask=None): @@ -341,7 +312,9 @@ def build_position_ids(x, attention_mask=None): def get_dtype(self): return self._torch_type - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): + def _get_full_prompt(self, messages: List[Dict], tools): + prompt, system_prompt, chat_history = parse_messages(messages) + system_prompt = system_prompt or "" query, image, history = self.get_query_and_history( prompt, system_prompt=system_prompt, chat_history=chat_history ) diff --git a/xinference/model/llm/transformers/cogvlm2_video.py b/xinference/model/llm/transformers/cogvlm2_video.py index 24f31e0b5c..f39119f7aa 100644 --- a/xinference/model/llm/transformers/cogvlm2_video.py +++ b/xinference/model/llm/transformers/cogvlm2_video.py @@ -12,28 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Tuple, Union import torch -from ....core.scheduler import InferenceRequest from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig -from .utils import get_max_src_len logger = logging.getLogger(__name__) @@ -170,9 +164,7 @@ def _message_content_to_cogvlm2(self, content): return text, images, video return content, [], None - def _history_content_to_cogvlm2( - self, system_prompt: str, chat_history: List[ChatCompletionMessage] - ): + def _history_content_to_cogvlm2(self, system_prompt: str, chat_history: List[Dict]): query = system_prompt history: List[Tuple] = [] pixel_values = None @@ -202,7 +194,7 @@ def get_query_and_history( self, prompt: Union[str, List[Dict]], system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + chat_history: Optional[List[Dict]] = None, ): content, image, video = self._message_content_to_cogvlm2(prompt) @@ -237,12 +229,12 @@ def get_query_and_history( def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - system_prompt = system_prompt if system_prompt else "" + system_prompt = "" + if messages[0]["role"] == "system": + system_prompt = messages[0]["content"] stream = generate_config.get("stream", False) if generate_config else 
False sanitized_config = { @@ -252,6 +244,7 @@ def chat( else 512, } + prompt, _, chat_history = parse_messages(messages) query, image, video, history = self.get_query_and_history( prompt, system_prompt=system_prompt, chat_history=chat_history ) @@ -292,21 +285,7 @@ def chat( response = self._tokenizer.decode(outputs[0]) response = response.split("<|end_of_text|>")[0] - chunk = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(chunk) + return generate_chat_completion(self.model_uid, response) def _streaming_chat_response( self, inputs: Dict, config: Dict @@ -333,192 +312,23 @@ def _streaming_chat_response( completion_id = str(uuid.uuid1()) for new_text in streamer: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=new_text, finish_reason=None, logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, ) - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - yield chunk - - @staticmethod - def build_position_ids(x, attention_mask=None): - """ - Copied from https://huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B-int4/blob/main/modeling_cogvlm.py - """ - # Fix: 参考官方开源代码 - if attention_mask is not None: - tmp = x.clone() - tmp[~(attention_mask.bool())] = -1 - else: - tmp = x.clone() - # image boi eoi token as LANGUAGE_TOKEN_TYPE - is_boi_eoi = torch.zeros_like(x, dtype=torch.bool) - is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & ( - tmp[:, :-1] == LANGUAGE_TOKEN_TYPE - ) - is_boi_eoi[:, 0] |= tmp[:, 0] == VISION_TOKEN_TYPE - is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & ( - tmp[:, 1:] == LANGUAGE_TOKEN_TYPE + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=False, ) - is_boi_eoi[:, -1] |= tmp[:, -1] == VISION_TOKEN_TYPE - tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE - # final position ids - y = torch.zeros_like(x, dtype=torch.long) - y[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | ( - (tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE) - ) - y = y.cumsum(dim=-1) - return y - - def get_dtype(self): - return self._torch_type - - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - query, image, video, history = self.get_query_and_history( - prompt, system_prompt=system_prompt, chat_history=chat_history - ) - - if video: - image = [video] - - input_by_model: dict = self._model.build_conversation_input_ids( # type: ignore - self._tokenizer, - query=query, - history=history, - images=image, - 
template_version="chat", - ) - return { - "input_ids": input_by_model["input_ids"], # seq_len - "token_type_ids": input_by_model["token_type_ids"], # seq_len - "attention_mask": input_by_model["attention_mask"], # seq_len - "images": input_by_model["images"], - } - - def prepare_sanitize_generate_config(self, req: InferenceRequest): - """ - See https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B/blob/main/generation_config.json - """ - raw_config = req.inference_kwargs.get("raw_params", {}) - temperature = raw_config.get("temperature", None) - if temperature is None: - raw_config["temperature"] = 0.6 - top_p = raw_config.get("top_p", None) - if top_p is None: - raw_config["top_p"] = 0.9 - return raw_config - - def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]): - context_len = self.get_context_len() - assert isinstance(prompts[0], dict) - images = [] - max_length = float("-inf") - for i, feature in enumerate(prompts): - req = req_list[i] - if "images" in feature: - images.append(feature.pop("images", None)) - max_src_len = get_max_src_len(context_len, req) - input_ids = feature["input_ids"][-max_src_len:] - req.prompt_tokens = input_ids.tolist() - feature["input_ids"] = input_ids - feature["token_type_ids"] = feature["token_type_ids"][-max_src_len:] - feature["attention_mask"] = feature["attention_mask"][-max_src_len:] - req.extra_kwargs["attention_mask_seq_len"] = feature[ - "attention_mask" - ].shape[0] - max_length = max(len(input_ids), max_length) - - def pad_to_max_length_internal(feature, max_len, idx): - padding_length = max_len - len(feature["input_ids"]) - req_list[idx].padding_len = padding_length - feature["input_ids"] = torch.cat( - [torch.full((padding_length,), 0), feature["input_ids"]] - ) - feature["token_type_ids"] = torch.cat( - [ - torch.zeros(padding_length, dtype=torch.long), - feature["token_type_ids"], - ] - ) - feature["attention_mask"] = torch.cat( - [ - torch.zeros(padding_length, dtype=torch.long), - feature["attention_mask"], - ] - ) - return feature - - features = [ - pad_to_max_length_internal(feature, max_length, i) - for i, feature in enumerate(prompts) - ] - batch = { - key: torch.stack([feature[key] for feature in features]) - for key in features[0].keys() - } - - position_ids = self.build_position_ids(batch["token_type_ids"]) - batch["position_ids"] = position_ids - - for i in range(len(prompts)): - req = req_list[i] - req.extra_kwargs["max_position_id"] = position_ids[i : i + 1, -1].item() - - if images: - batch["images"] = images - - batch = recur_move_to( - batch, self._device, lambda x: isinstance(x, torch.Tensor) - ) - dtype = self.get_dtype() - if dtype: - batch = recur_move_to( - batch, - dtype, - lambda x: isinstance(x, torch.Tensor) and torch.is_floating_point(x), - ) - return batch - - def build_decode_token_type_ids( - self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] - ): - token_type_ids = torch.full( - (batch_size, 1), fill_value=1, dtype=torch.long, device=self._device - ) - return token_type_ids - - def build_decode_position_ids( - self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] - ): - tmp = [] - for r in reqs: - r.extra_kwargs["max_position_id"] += 1 - tmp.append(r.extra_kwargs["max_position_id"]) - position_ids = torch.as_tensor( - tmp, device=self._device, dtype=torch.long - ).unsqueeze(1) - return position_ids diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index b02f88e947..fd7d75b22e 100644 --- 
a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -16,7 +16,7 @@ import logging import os from functools import lru_cache -from typing import Iterable, Iterator, List, Optional, Tuple, Union +from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union import torch @@ -29,7 +29,6 @@ from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, @@ -52,8 +51,6 @@ "chatglm3-128k", "glm4-chat", "glm4-chat-1m", - "llama-2", - "llama-2-chat", "internlm2-chat", "internlm2.5-chat", "qwen-vl-chat", @@ -615,12 +612,13 @@ def prepare_batch_inference(self, req_list: List[InferenceRequest]): r.error_msg = str(e) def get_builtin_stop_token_ids(self) -> Tuple: - return ( - tuple(self.model_family.prompt_style.stop_token_ids) - if self.model_family.prompt_style - and self.model_family.prompt_style.stop_token_ids - else tuple() - ) + from ..utils import get_stop_token_ids_from_config_file + + stop_token_ids = get_stop_token_ids_from_config_file(self.model_path) + if stop_token_ids is not None: + return tuple(stop_token_ids) + else: + return tuple(self.model_family.stop_token_ids) def handle_batch_inference_results(self, req_list: List[InferenceRequest]): for req in req_list: @@ -693,20 +691,13 @@ def _sanitize_generate_config( generate_config: Optional[PytorchGenerateConfig], ) -> PytorchGenerateConfig: generate_config = super()._sanitize_generate_config(generate_config) - if ( - (not generate_config.get("stop")) - and self.model_family.prompt_style - and self.model_family.prompt_style.stop - ): - generate_config["stop"] = self.model_family.prompt_style.stop.copy() + if (not generate_config.get("stop")) and self.model_family.stop is not None: + generate_config["stop"] = self.model_family.stop.copy() if ( generate_config.get("stop_token_ids", None) is None - and self.model_family.prompt_style - and self.model_family.prompt_style.stop_token_ids + and self.model_family.stop_token_ids is not None ): - generate_config[ - "stop_token_ids" - ] = self.model_family.prompt_style.stop_token_ids.copy() + generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy() return generate_config @@ -725,26 +716,22 @@ def match( def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: tools = generate_config.pop("tools", []) if generate_config else None - full_prompt = self._get_full_prompt(prompt, system_prompt, chat_history, tools) - - generate_config = self._sanitize_generate_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. 
model_family = self.model_family.model_family or self.model_family.model_name + full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = list(stop) + ["Observation:"] - else: - generate_config["stop"] = "Observation:" + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, + self.model_family.chat_template, + tokenizer=self._tokenizer, + **full_context_kwargs, + ) + + generate_config = self._sanitize_generate_config(generate_config) stream = generate_config.get("stream", False) if stream: @@ -755,22 +742,15 @@ def chat( c = self.generate(full_prompt, generate_config) assert not isinstance(c, Iterator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) def load(self): super().load() - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] - full_prompt = ChatModelMixin.get_prompt( - prompt, chat_history, prompt_style, tools=tools + def _get_full_prompt(self, messages: List[Dict], tools): + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, tokenizer=self._tokenizer ) return full_prompt @@ -779,9 +759,7 @@ def prepare_batch_inference(self, req_list: List[InferenceRequest]): for r in req_list: try: if not r.stopped and r.is_prefill: - r.full_prompt = self._get_full_prompt( - r.prompt, r.system_prompt, r.chat_history, None - ) + r.full_prompt = self._get_full_prompt(r.prompt, None) except Exception as e: logger.exception(f"prepare inference error with {e}") r.stopped = True diff --git a/xinference/model/llm/transformers/deepseek_vl.py b/xinference/model/llm/transformers/deepseek_vl.py index d24158f5d4..cfec06b7d8 100644 --- a/xinference/model/llm/transformers/deepseek_vl.py +++ b/xinference/model/llm/transformers/deepseek_vl.py @@ -15,7 +15,6 @@ import logging import os.path import tempfile -import time import uuid from concurrent.futures import ThreadPoolExecutor from io import BytesIO @@ -25,16 +24,9 @@ import torch from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -147,9 +139,7 @@ def _fill_placeholder(_url, _index): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: if not generate_config: @@ -162,44 +152,40 @@ def chat( if isinstance(stream_options, dict) else False ) - prompt, images = self._message_content_to_deepseek(prompt) - 
prompt_messages: List[Dict[str, Any]] = [ - { - "role": "User", - "content": prompt, - }, - {"role": "Assistant", "content": ""}, - ] - if images: - prompt_messages[0]["images"] = images - - # Convert openai history to qwen vl history - deepseek_history = [] - for h in chat_history or []: - role = h["role"] + + prompt = "" + deepseek_messages = [] + for i, message in enumerate(messages): + role = message["role"] + content = message["content"] if role == "user": - content, images = self._message_content_to_deepseek(h["content"]) - msg: Dict[str, Any] = { - "role": "User", - "content": content, - } - if images: - msg["images"] = images - deepseek_history.append(msg) + if isinstance(content, str): + deepseek_messages.append({"role": "User", "content": content}) + else: + content, images = self._message_content_to_deepseek(content) + msg: Dict[str, Any] = { + "role": "User", + "content": content, + } + if images: + msg["images"] = images + deepseek_messages.append(msg) + if i == len(messages) - 1: + prompt = content elif role == "assistant": - deepseek_history.append({"role": "Assistant", "content": h["content"]}) + deepseek_messages.append({"role": "Assistant", "content": content}) else: - logger.error("Unexpected msg in chat history: %s", h) - - deepseek_history.extend(prompt_messages) + logger.error( + f"Unexpected message in messages: role: {role}, message: {message}" + ) from ....thirdparty.deepseek_vl.serve.inference import generate from ....thirdparty.deepseek_vl.utils.io import load_pil_images # load images and prepare for inputs - pil_images = load_pil_images(deepseek_history) + pil_images = load_pil_images(deepseek_messages) prepare_inputs = self._vl_chat_processor( - conversations=deepseek_history, images=pil_images, force_batchify=True + conversations=deepseek_messages, images=pil_images, force_batchify=True ).to(self._model.device, self._model.dtype) temperature = generate_config.get("temperature", 0.2) @@ -226,31 +212,16 @@ def chat( it = self._generate_stream(streamer, stop_str, include_usage, prompt) return self._to_chat_completion_chunks(it) else: - c = self._generate(streamer, stop_str) - return self._to_chat_completion(c) + return self._generate(streamer, stop_str) - def _generate(self, streamer, stop_str) -> Completion: + def _generate(self, streamer, stop_str) -> ChatCompletion: generated_text = "" for new_text in streamer: if new_text.endswith(stop_str): new_text = new_text[: -len(stop_str)] generated_text += new_text - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=generated_text, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return c + return generate_chat_completion(self.model_uid, generated_text) def _generate_stream( self, streamer, stop_str, include_usage, prompt @@ -262,54 +233,40 @@ def _generate_stream( for i, new_text in enumerate(streamer): if new_text.endswith(stop_str): new_text = new_text[: -len(stop_str)] - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = i total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + 
finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=True, ) - chunk["usage"] = completion_usage - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk + if include_usage: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk diff --git a/xinference/model/llm/transformers/glm4v.py b/xinference/model/llm/transformers/glm4v.py index 4df4f9cd4d..c16a167688 100644 --- a/xinference/model/llm/transformers/glm4v.py +++ b/xinference/model/llm/transformers/glm4v.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import typing import uuid from concurrent.futures import ThreadPoolExecutor @@ -22,18 +21,10 @@ import torch from ....core.scheduler import InferenceRequest -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk from .core import PytorchChatModel, PytorchGenerateConfig from .utils import get_max_src_len @@ -102,66 +93,45 @@ def load(self): self._tokenizer = tokenizer self._save_tensorizer() - def _message_content_to_chat(self, content): - if not isinstance(content, str): - texts = [] - image_urls = [] - for c in content: - c_type = c.get("type") - if c_type == "text": - texts.append(c["text"]) - elif c_type == "image_url": - image_urls.append(c["image_url"]["url"]) - image_futures = [] - with ThreadPoolExecutor() as executor: - for image_url in image_urls: - fut = executor.submit(_decode_image, image_url) - image_futures.append(fut) - images = [fut.result() for fut in image_futures] - text = " ".join(texts) - if len(images) == 0: - return text, [] - elif len(images) == 1: - return text, images + @staticmethod + def _get_processed_msgs(messages: List[Dict]) -> List[Dict]: + res = [] + for message in messages: + role = message["role"] + content = message["content"] + if isinstance(content, str): + res.append({"role": role, "content": content}) else: - raise RuntimeError("Only one image per message is supported") - return content, [] - - def _get_chat_msgs( - self, - prompt: Union[str, List[Dict]], 
- chat_history: Optional[List[ChatCompletionMessage]] = None, - ): - content, images_chat = self._message_content_to_chat(prompt) - - msgs = [] - query_to_response: List[Dict] = [] - images_history = [] - for h in chat_history or []: - role = h["role"] - content_h, images_tmp = self._message_content_to_chat(h["content"]) - if images_tmp: - images_history = images_tmp - if len(query_to_response) == 0 and role == "user": - query_to_response.append({"role": "user", "content": content_h}) - if len(query_to_response) == 1 and role == "assistant": - query_to_response.append({"role": "assistant", "content": content_h}) - if len(query_to_response) == 2: - msgs.extend(query_to_response) - query_to_response = [] - image = None - if len(images_chat) > 0: - image = images_chat[0] - elif len(images_history) > 0: - image = images_history[0] - msgs.append({"role": "user", "content": content, "image": image}) - return msgs + texts = [] + image_urls = [] + for c in content: + c_type = c.get("type") + if c_type == "text": + texts.append(c["text"]) + else: + assert ( + c_type == "image_url" + ), "Please follow the image input of the OpenAI API." + image_urls.append(c["image_url"]["url"]) + if len(image_urls) > 1: + raise RuntimeError("Only one image per message is supported") + image_futures = [] + with ThreadPoolExecutor() as executor: + for image_url in image_urls: + fut = executor.submit(_decode_image, image_url) + image_futures.append(fut) + images = [fut.result() for fut in image_futures] + assert len(images) <= 1 + text = " ".join(texts) + if images: + res.append({"role": role, "content": text, "image": images[0]}) + else: + res.append({"role": role, "content": text}) + return res def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: from transformers import TextIteratorStreamer @@ -170,7 +140,7 @@ def chat( generate_config = {} stream = generate_config.get("stream", False) - msgs = self._get_chat_msgs(prompt, chat_history) + msgs = self._get_processed_msgs(messages) inputs = self._tokenizer.apply_chat_template( msgs, @@ -213,64 +183,38 @@ def chat( response = self._tokenizer.decode(outputs[0]) if response.endswith(stop_str): response = response[: -len(stop_str)] - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(c) + return generate_chat_completion(self.model_uid, response) def chat_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]: completion_id = str(uuid.uuid1()) for new_text in streamer: if not new_text.endswith(stop_str): - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=True, ) - 
chunk["usage"] = completion_usage - yield chunk - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - msgs = self._get_chat_msgs(prompt, chat_history) + def _get_full_prompt(self, messages, tools): + msgs = self._get_processed_msgs(messages) inputs = self._tokenizer.apply_chat_template( msgs, add_generation_prompt=True, diff --git a/xinference/model/llm/transformers/intern_vl.py b/xinference/model/llm/transformers/intern_vl.py index 02632e2af8..242d4d27ac 100644 --- a/xinference/model/llm/transformers/intern_vl.py +++ b/xinference/model/llm/transformers/intern_vl.py @@ -12,24 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Union import torch -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -78,7 +74,7 @@ def _message_content_to_intern(content, image_cnt): def _get_prompt_and_chat_history( prompt: Union[str, List[Dict]], - chat_history: Optional[List[ChatCompletionMessage]] = None, + chat_history: Optional[List[Dict]] = None, ): # Convert openai history to intern vl history images = [] @@ -332,9 +328,7 @@ def load(self, **kwargs): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: from ....thirdparty.internvl.conversation import get_conv_template @@ -366,6 +360,7 @@ def chat( else False ) + prompt, _, chat_history = parse_messages(messages) content, history, images, videos = _get_prompt_and_chat_history( prompt, chat_history ) @@ -434,10 +429,9 @@ def chat( chunk = self._generate_stream(generate_kwargs, input_ids, include_usage) return self._to_chat_completion_chunks(chunk) else: - chunk = self._generate(generate_kwargs, input_ids, template) - return self._to_chat_completion(chunk) + return self._generate(generate_kwargs, input_ids, template) - def _generate(self, generate_kwargs, input_ids, template): + def _generate(self, generate_kwargs, input_ids, template) -> ChatCompletion: prompt_tokens = len(input_ids[0]) generation_output = self._model.generate(**generate_kwargs) completion_tokens = len(generation_output[0]) @@ -445,23 +439,13 @@ def _generate(self, generate_kwargs, input_ids, template): 
generation_output, skip_special_tokens=True )[0] response = response.split(template.sep)[0].strip() - chunk = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ), + return generate_chat_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, ) - return chunk def _generate_stream(self, generate_kwargs, input_ids, include_usage): from threading import Thread @@ -483,58 +467,43 @@ def _generate_stream(self, generate_kwargs, input_ids, include_usage): completion_id = str(uuid.uuid1()) prompt_tokens = len(input_ids[0]) - completion_tokens = 0 + total_tokens, completion_tokens = 0, 0 # Loop through the streamer to get the new text as it is generated for i, new_text in enumerate(streamer): if new_text == self._model.conv_template.sep: break - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = max(completion_tokens, len(streamer.token_cache)) total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - chunk["usage"] = completion_usage - yield chunk - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk + if include_usage: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk diff --git a/xinference/model/llm/transformers/internlm2.py b/xinference/model/llm/transformers/internlm2.py index fc7b1c7588..fa046be8de 100644 --- a/xinference/model/llm/transformers/internlm2.py +++ b/xinference/model/llm/transformers/internlm2.py @@ -11,23 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
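Note: many of the rewritten chat() methods above unpack parse_messages(messages) into the latest prompt, an optional system prompt, and the prior history of an OpenAI-style messages list. The helper itself sits in xinference/model/llm/utils.py and is not visible in this excerpt; the sketch below is a hypothetical reconstruction based only on how it is called (prompt, system_prompt, chat_history = parse_messages(messages)), not the actual implementation.

from typing import Dict, List, Optional, Tuple


def parse_messages(messages: List[Dict]) -> Tuple[str, Optional[str], List[Dict]]:
    """Split an OpenAI-style messages list into (prompt, system_prompt, chat_history)."""
    system_contents = [m["content"] for m in messages if m["role"] == "system"]
    non_system = [m for m in messages if m["role"] != "system"]
    if not non_system:
        raise ValueError("messages must contain at least one non-system message")
    prompt = non_system[-1]["content"]      # the message currently being answered
    chat_history = non_system[:-1]          # earlier turns, in order
    system_prompt = system_contents[0] if system_contents else None
    return prompt, system_prompt, chat_history


# Example
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "What does this patch change?"},
]
print(parse_messages(messages))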
-import time import uuid from typing import Any, Dict, Iterator, List, Optional, Union from ....core.scheduler import InferenceRequest -from ....types import ( - ChatCompletion, - ChatCompletionChoice, - ChatCompletionChunk, - ChatCompletionMessage, - CompletionChoice, - CompletionChunk, - CompletionUsage, - LoRA, - PytorchGenerateConfig, -) +from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk, parse_messages from .core import PytorchChatModel, PytorchModelConfig @@ -106,9 +96,7 @@ def prepare_sanitize_generate_config(self, req: InferenceRequest): def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: kwargs: Dict[str, Any] = {} @@ -130,6 +118,8 @@ def chat( if isinstance(stream_options, dict) else False ) + + prompt, system_prompt, chat_history = parse_messages(messages) if chat_history: input_history = [ (chat_history[i]["content"], (chat_history[i + 1]["content"])) @@ -155,54 +145,42 @@ def _stream_generator(): total_tokens = prompt_tokens + completion_tokens chunk_text = chunk_text[last_chunk_text_length:] last_chunk_text_length += len(chunk_text) - completion_choice = CompletionChoice( - text=chunk_text, index=0, logprobs=None, finish_reason=None - ) - yield CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=total_tokens, - ), + + yield generate_completion_chunk( + chunk_text, + finish_reason=None, + chunk_id=chunk_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, ) + yield generate_completion_chunk( + None, + finish_reason="stop", + chunk_id=chunk_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) if include_usage: - chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + None, + finish_reason=None, + chunk_id=chunk_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, ) - yield chunk return self._to_chat_completion_chunks(_stream_generator()) else: response, _ = self._model.chat( self._tokenizer, prompt, input_history, **kwargs ) - return ChatCompletion( - id="chat" + str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - ChatCompletionChoice( - index=0, - message={"role": "assistant", "content": response}, - finish_reason="stop", - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) + return generate_chat_completion(self.model_uid, response) diff --git a/xinference/model/llm/transformers/llama_2.py b/xinference/model/llm/transformers/llama_2.py deleted file mode 100644 index 4e5e01d263..0000000000 --- a/xinference/model/llm/transformers/llama_2.py +++ /dev/null @@ -1,108 
+0,0 @@ -# Copyright 2022-2023 XProbe Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional - -from ....types import LoRA -from ..llm_family import LLMFamilyV1, LLMSpecV1 -from .core import PytorchChatModel, PytorchModel, PytorchModelConfig - - -class LlamaPytorchModel(PytorchModel): - def __init__( - self, - model_uid: str, - model_family: "LLMFamilyV1", - model_spec: "LLMSpecV1", - quantization: str, - model_path: str, - pytorch_model_config: Optional[PytorchModelConfig] = None, - peft_model: Optional[List[LoRA]] = None, - ): - super().__init__( - model_uid, - model_family, - model_spec, - quantization, - model_path, - pytorch_model_config=pytorch_model_config, - peft_model=peft_model, - ) - - def _load_model(self, **kwargs): - model, tokenizer = super()._load_model(**kwargs) - # Llama has no pad token by default - # https://github.com/huggingface/transformers/blob/07998ef39926b76d3f6667025535d0859eed61c3/docs/source/en/llm_tutorial.md?plain=1#L125 - tokenizer.pad_token = tokenizer.eos_token - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - @classmethod - def match( - cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - if llm_spec.model_format != "pytorch": - return False - model_family = llm_family.model_family or llm_family.model_name - if "llama-2" not in model_family: - return False - if "generate" not in llm_family.model_ability: - return False - return True - - -class LlamaPytorchChatModel(PytorchChatModel): - def __init__( - self, - model_uid: str, - model_family: "LLMFamilyV1", - model_spec: "LLMSpecV1", - quantization: str, - model_path: str, - pytorch_model_config: Optional["PytorchModelConfig"] = None, - peft_model: Optional[List[LoRA]] = None, - ): - super().__init__( - model_uid, - model_family, - model_spec, - quantization, - model_path, - peft_model=peft_model, - pytorch_model_config=pytorch_model_config, - ) - self._use_fast_tokenizer = False - - def _load_model(self, **kwargs): - model, tokenizer = super()._load_model(**kwargs) - # Llama has no pad token by default - # https://github.com/huggingface/transformers/blob/07998ef39926b76d3f6667025535d0859eed61c3/docs/source/en/llm_tutorial.md?plain=1#L125 - tokenizer.pad_token = tokenizer.eos_token - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - @classmethod - def match( - cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - if llm_spec.model_format != "pytorch": - return False - model_family = llm_family.model_family or llm_family.model_name - if "llama-2" not in model_family: - return False - if "chat" not in llm_family.model_ability: - return False - return True diff --git a/xinference/model/llm/transformers/minicpmv25.py b/xinference/model/llm/transformers/minicpmv25.py index af22319759..41b100d867 100644 --- 
a/xinference/model/llm/transformers/minicpmv25.py +++ b/xinference/model/llm/transformers/minicpmv25.py @@ -13,25 +13,21 @@ # limitations under the License. import json import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Union import torch -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -125,12 +121,11 @@ def _message_content_to_chat(self, content): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: stream = generate_config.get("stream", False) if generate_config else False + prompt, _, chat_history = parse_messages(messages) content, images_chat = self._message_content_to_chat(prompt) msgs = [] @@ -166,57 +161,29 @@ def chat( it = self.chat_stream(chat) return self._to_chat_completion_chunks(it) else: - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=chat, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(c) + return generate_chat_completion(self.model_uid, chat) def chat_stream(self, chat) -> Iterator[CompletionChunk]: completion_id = str(uuid.uuid1()) for new_text in chat: - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, ) - chunk["usage"] = completion_usage - yield chunk - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk diff --git a/xinference/model/llm/transformers/minicpmv26.py b/xinference/model/llm/transformers/minicpmv26.py index 0900bc4a86..7e97bca4f0 100644 --- a/xinference/model/llm/transformers/minicpmv26.py +++ b/xinference/model/llm/transformers/minicpmv26.py @@ -12,7 +12,6 @@ # See the License for the specific language governing 
permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Union @@ -20,18 +19,15 @@ import torch from PIL import Image -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -160,13 +156,12 @@ def _load_video(_url): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: stream = generate_config.get("stream", False) if generate_config else False videoExisted = False + prompt, _, chat_history = parse_messages(messages) content, images_chat, video_frames = self._message_content_to_chat(prompt) if len(video_frames) > 0: @@ -216,57 +211,28 @@ def chat( it = self.chat_stream(chat) return self._to_chat_completion_chunks(it) else: - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=chat, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(c) + return generate_chat_completion(self.model_uid, chat) def chat_stream(self, chat) -> Iterator[CompletionChunk]: completion_id = str(uuid.uuid1()) for new_text in chat: - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, ) - chunk["usage"] = completion_usage - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk diff --git a/xinference/model/llm/transformers/omnilmm.py b/xinference/model/llm/transformers/omnilmm.py index 583f3cc56e..3ddffda0a4 100644 --- a/xinference/model/llm/transformers/omnilmm.py +++ b/xinference/model/llm/transformers/omnilmm.py @@ -16,20 +16,13 @@ import logging import operator import tempfile -import time -import uuid from typing import Dict, Iterator, List, 
Optional, Tuple, Union from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64 -from ....types import ( - ChatCompletion, - ChatCompletionChoice, - ChatCompletionChunk, - ChatCompletionMessage, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, parse_messages from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -96,15 +89,14 @@ def _ensure_url(_url): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: if generate_config and generate_config.get("stream"): raise Exception( f"Chat with model {self.model_family.model_name} does not support stream." ) + prompt, _, chat_history = parse_messages(messages) image_first, prompt = self._message_content_to_OmniLMM(prompt) msgs = [] @@ -135,19 +127,4 @@ def chat( input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)} answer = self._model.chat(input=input) - return ChatCompletion( - id="chat" + str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - ChatCompletionChoice( - index=0, - message={"role": "assistant", "content": answer}, - finish_reason="stop", - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) + return generate_chat_completion(self.model_uid, answer) diff --git a/xinference/model/llm/transformers/qwen_vl.py b/xinference/model/llm/transformers/qwen_vl.py index 8a2be562e3..e7db57334c 100644 --- a/xinference/model/llm/transformers/qwen_vl.py +++ b/xinference/model/llm/transformers/qwen_vl.py @@ -15,7 +15,6 @@ import logging import operator import tempfile -import time import typing import uuid from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -25,16 +24,9 @@ from ....core.scheduler import InferenceRequest from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk from .core import PytorchChatModel, PytorchGenerateConfig from .utils import pad_prefill_tokens @@ -129,18 +121,12 @@ def _ensure_url(_url): return self._tokenizer.from_list_format(content) return content - def _get_prompt_and_chat_history( - self, - prompt: Union[str, List[Dict]], - chat_history: Optional[List[ChatCompletionMessage]] = None, - ): - prompt = self._message_content_to_qwen(prompt) - # Convert openai history to qwen vl history + def _get_prompt_and_chat_history(self, messages: List[Dict]): qwen_history = [] query_to_response: List = [] - for h in chat_history or []: - role = h["role"] - content = self._message_content_to_qwen(h["content"]) + for message in messages[:-1]: + role = message["role"] + content = self._message_content_to_qwen(message["content"]) if len(query_to_response) == 0 and role == "user": query_to_response.append(content) if len(query_to_response) == 1 and role == "assistant": @@ -148,18 +134,15 @@ def 
_get_prompt_and_chat_history( if len(query_to_response) == 2: qwen_history.append(query_to_response) query_to_response = [] + prompt = self._message_content_to_qwen(messages[-1]["content"]) return prompt, qwen_history def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - prompt, qwen_history = self._get_prompt_and_chat_history( - prompt, chat_history=chat_history - ) + prompt, qwen_history = self._get_prompt_and_chat_history(messages) stream = generate_config.get("stream", False) if generate_config else False stream_options = ( @@ -174,33 +157,17 @@ def chat( it = self._generate_stream(prompt, qwen_history, include_usage) # type: ignore return self._to_chat_completion_chunks(it) else: - c = self._generate(prompt, qwen_history) # type: ignore - return self._to_chat_completion(c) + return self._generate(prompt, qwen_history) # type: ignore - def _generate(self, prompt: str, qwen_history: List) -> Completion: + def _generate(self, prompt: str, qwen_history: List) -> ChatCompletion: response, history = self._model.chat( self._tokenizer, query=prompt, history=qwen_history ) - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return c + return generate_chat_completion(self.model_uid, response) def _generate_stream( self, prompt: str, qwen_history: List, include_usage ) -> Iterator[CompletionChunk]: - # response, history = model.chat(tokenizer, message, history=history) response_generator = self._model.chat_stream( self._tokenizer, query=prompt, history=qwen_history ) @@ -212,57 +179,40 @@ def _generate_stream( for response in response_generator: inc_content = response[len(full_response) :] full_response = response - completion_choice = CompletionChoice( - text=inc_content, index=0, logprobs=None, finish_reason=None - ) - completion_chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = completion_tokens + 1 total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=inc_content, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - completion_chunk["usage"] = completion_usage - yield completion_chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - completion_chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - completion_chunk["usage"] = completion_usage - yield completion_chunk if include_usage: - chunk = CompletionChunk( - 
id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk @staticmethod def get_batch_size_and_seq_len_indexes_from_kv() -> Tuple[int, int]: @@ -359,10 +309,8 @@ def _tokenize_str(role, content): return raw_text, context_tokens - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - prompt, qwen_history = self._get_prompt_and_chat_history( - prompt, chat_history=chat_history - ) + def _get_full_prompt(self, messages: List[Dict], tools): + prompt, qwen_history = self._get_prompt_and_chat_history(messages) _, context_tokens = self.make_context(self._tokenizer, prompt, qwen_history) return context_tokens diff --git a/xinference/model/llm/transformers/tests/test_tensorizer.py b/xinference/model/llm/transformers/tests/test_tensorizer.py index a4e228259c..87fd38a7a7 100644 --- a/xinference/model/llm/transformers/tests/test_tensorizer.py +++ b/xinference/model/llm/transformers/tests/test_tensorizer.py @@ -37,7 +37,9 @@ def setup_and_teardown(self): model_lang=["en", "zh"], model_ability=["chat", "tools"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) if not os.path.exists(self.model_path): diff --git a/xinference/model/llm/transformers/utils.py b/xinference/model/llm/transformers/utils.py index 5ada9a512c..d34112d24f 100644 --- a/xinference/model/llm/transformers/utils.py +++ b/xinference/model/llm/transformers/utils.py @@ -321,7 +321,7 @@ def generate_stream( if stream: completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason=finish_reason + index=0, logprobs=None, finish_reason=finish_reason ) else: completion_choice = CompletionChoice( @@ -430,39 +430,6 @@ def pad_prefill_tokens( return prompt_tokens -def _get_completion_chunk( - output: str, - chunk_id: str, - finish_reason: Optional[str], - model_uid: str, - r: InferenceRequest, - just_usage: bool, -): - completion_choice = ( - [ - CompletionChoice( - text=output, index=0, logprobs=None, finish_reason=finish_reason - ) - ] - if not just_usage - else [] - ) - completion_chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=model_uid, - choices=completion_choice, - ) - completion_usage = CompletionUsage( - prompt_tokens=len(r.prompt_tokens), - completion_tokens=len(r.new_tokens), - total_tokens=len(r.prompt_tokens) + len(r.new_tokens), - ) - completion_chunk["usage"] = completion_usage - return completion_chunk - - def _get_completion( output: str, chunk_id: str, @@ -551,6 +518,8 @@ def _batch_inference_one_step_internal( bos_flag: str = "", eos_flag: str = "", ): + from ..utils import generate_completion_chunk + # need to judge stopped here, # since some requests state may change to stopped due to invalid parameters, e.g. 
max_src_len valid_req_list = [r for r in req_list if not r.stopped] @@ -710,11 +679,30 @@ def _batch_inference_one_step_internal( output = output[r.last_output_length :] r.last_output_length += len(output) - completion_chunk = _get_completion_chunk( - output, r.chunk_id, r.finish_reason, model_uid, r, False + completion_chunk = generate_completion_chunk( + chunk_text=output, + finish_reason=None, + chunk_id=r.chunk_id, + model_uid=model_uid, + prompt_tokens=len(r.prompt_tokens), + completion_tokens=len(r.new_tokens), + total_tokens=len(r.prompt_tokens) + len(r.new_tokens), ) r.completion.append(completion_chunk) if r.stopped: + # OpenAI compatible chunk + completion_chunk = generate_completion_chunk( + chunk_text=None, + finish_reason=r.finish_reason, + chunk_id=r.chunk_id, + model_uid=model_uid, + prompt_tokens=len(r.prompt_tokens), + completion_tokens=len(r.new_tokens), + total_tokens=len(r.prompt_tokens) + len(r.new_tokens), + has_choice=True, + has_content=False, + ) + r.completion.append(completion_chunk) r.completion.append(eos_flag) # last round, handle stream result @@ -723,8 +711,16 @@ def _batch_inference_one_step_internal( # these tokens are real generated and should be counted. if r.stopped and _i == decode_round - 1 and include_usage: r.completion.append( - _get_completion_chunk( - "", r.chunk_id, r.finish_reason, model_uid, r, True + generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=r.chunk_id, + model_uid=model_uid, + prompt_tokens=len(r.prompt_tokens), + completion_tokens=len(r.new_tokens), + total_tokens=len(r.prompt_tokens) + len(r.new_tokens), + has_choice=False, + has_content=False, ) ) else: diff --git a/xinference/model/llm/transformers/yi_vl.py b/xinference/model/llm/transformers/yi_vl.py index e4b3d1f6ce..9cfa87a536 100644 --- a/xinference/model/llm/transformers/yi_vl.py +++ b/xinference/model/llm/transformers/yi_vl.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from threading import Thread @@ -21,17 +20,14 @@ import torch from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -105,15 +101,11 @@ def _message_content_to_yi(content) -> Union[str, tuple]: def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: from transformers import TextIteratorStreamer - # TODO(codingl2k1): implement stream mode. 
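
For reference, the streaming contract that the refactored code above (the qwen-vl rewrite and the batching helper in transformers/utils.py) converges on is: one CompletionChunk per text delta, then a final chunk that carries only the finish_reason, and, when usage reporting is requested via include_usage, a trailing usage-only chunk whose choices list is empty. A minimal consumer sketch under that assumption; the function and variable names here are illustrative, not part of xinference:

# Sketch only: collect a stream of CompletionChunk dicts produced by the new helpers.
def collect_stream(chunks):
    text_parts, finish_reason, usage = [], None, None
    for chunk in chunks:
        choices = chunk.get("choices", [])
        if not choices:  # trailing usage-only chunk (include_usage=True)
            usage = chunk.get("usage")
            continue
        choice = choices[0]
        if choice.get("finish_reason") is not None:
            finish_reason = choice["finish_reason"]  # final, content-less chunk
        else:
            text_parts.append(choice.get("text", ""))
    return "".join(text_parts), finish_reason, usage
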
- if not generate_config: generate_config = {} @@ -134,7 +126,8 @@ def chat( # Convert chat history to llava state state = conv_templates["mm_default"].copy() - for message in chat_history or []: + prompt, _, chat_history = parse_messages(messages) + for message in chat_history: content = self._message_content_to_yi(message["content"]) state.append_message(message["role"], content) state.append_message(state.roles[0], self._message_content_to_yi(prompt)) @@ -190,31 +183,15 @@ def chat( it = self._generate_stream(streamer, stop_str, input_ids, include_usage) return self._to_chat_completion_chunks(it) else: - c = self._generate(streamer, stop_str) - return self._to_chat_completion(c) + return self._generate(streamer, stop_str) - def _generate(self, streamer, stop_str) -> Completion: + def _generate(self, streamer, stop_str) -> ChatCompletion: generated_text = "" for new_text in streamer: generated_text += new_text if generated_text.endswith(stop_str): generated_text = generated_text[: -len(stop_str)] - - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=generated_text, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return c + return generate_chat_completion(self.model_uid, generated_text) def _generate_stream( self, streamer, stop_str, input_ids, include_usage @@ -224,54 +201,37 @@ def _generate_stream( prompt_tokens = len(input_ids[0]) for i, new_text in enumerate(streamer): if not new_text.endswith(stop_str): - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = i total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - chunk["usage"] = completion_usage - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk if include_usage: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 8107d890a0..974671720e 100644 --- a/xinference/model/llm/utils.py +++ 
b/xinference/model/llm/utils.py @@ -17,6 +17,7 @@ import logging import os import time +import typing import uuid from io import BytesIO from typing import AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast @@ -25,19 +26,19 @@ from PIL import Image from ...types import ( - SPECIAL_TOOL_PROMPT, ChatCompletion, + ChatCompletionChoice, ChatCompletionChunk, - ChatCompletionMessage, Completion, + CompletionChoice, CompletionChunk, + CompletionUsage, ) from ..utils import ensure_cache_cleared from .llm_family import ( LlamaCppLLMSpecV1, LLMFamilyV1, LLMSpecV1, - PromptStyleV1, _get_cache_dir, get_cache_status, ) @@ -46,7 +47,6 @@ QWEN_TOOL_CALL_FAMILY = [ - "qwen-chat", "qwen1.5-chat", "qwen1.5-moe-chat", "qwen2-instruct", @@ -58,416 +58,90 @@ "glm4-chat-1m", ] +QWEN_TOOL_CALL_SYMBOLS = ["", ""] + class ChatModelMixin: @staticmethod - def get_prompt( - prompt: str, - chat_history: List[ChatCompletionMessage], - prompt_style: PromptStyleV1, - tools: Optional[List[Dict]] = None, - ): + @functools.lru_cache + def _compile_jinja_template(chat_template): """ - Inspired by FastChat. Format chat history into a prompt according to the prompty style of - different models. + Copied from transformers source code. """ - assert prompt_style.roles is not None - if prompt != SPECIAL_TOOL_PROMPT: - chat_history.append( - ChatCompletionMessage(role=prompt_style.roles[0], content=prompt) - ) - chat_history.append( - ChatCompletionMessage(role=prompt_style.roles[1], content="") + try: + from jinja2.exceptions import TemplateError + from jinja2.sandbox import ImmutableSandboxedEnvironment + except ImportError: + raise ImportError("xinference requires jinja2 to be installed.") + + def raise_exception(message): + raise TemplateError(message) + + jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True) + jinja_env.globals["raise_exception"] = raise_exception + return jinja_env.from_string(chat_template) + + def _build_from_raw_template( + self, messages: List, chat_template: str, **kwargs + ) -> str: + compiled_template = self._compile_jinja_template(chat_template) + rendered = compiled_template.render( + messages=messages, add_generation_prompt=True, **kwargs ) - - def get_role(role_name: str): - if role_name == "user": - return prompt_style.roles[0] - elif role_name == "assistant": - return prompt_style.roles[1] - else: - return role_name - - if prompt_style.style_name == "ADD_COLON_SINGLE": - ret = prompt_style.system_prompt + prompt_style.intra_message_sep - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + ": " + content + prompt_style.intra_message_sep - else: - ret += role + ":" - return ret - elif prompt_style.style_name == "NO_COLON_TWO": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = prompt_style.system_prompt - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + content + seps[i % 2] - else: - ret += role - return ret - elif prompt_style.style_name == "LLAMA2": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = "" - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - if i == 0: - ret += prompt_style.system_prompt + content - else: - ret += role + " " + content + seps[i % 2] - else: - ret += role - return ret - elif prompt_style.style_name == "LLAMA3": - ret = ( - 
f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>" - f"{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}" - ) - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += ( - f"<|start_header_id|>{role}<|end_header_id|>" - f"{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}" - ) - else: - ret += f"<|start_header_id|>{role}<|end_header_id|>{prompt_style.intra_message_sep}" - return ret - elif prompt_style.style_name == "MIXTRAL_V01": - ret = "" - for i, message in enumerate(chat_history): - content = message["content"] - if i % 2 == 0: # user - ret += f" [INST] {content} [/INST]" - else: # assistant - ret += f"{content} " - return ret - elif prompt_style.style_name == "CHATGLM3": - prompts = ( - [f"<|system|>\n {prompt_style.system_prompt}"] - if prompt_style.system_prompt - else [] - ) - - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message.get("content") - tool_calls = message.get("tool_calls") - if tool_calls: - content = tool_calls[0]["function"] - if content: - if role == "tool": - role = "observation" - prompts.append(f"<|{role}|>\n {content}") - else: - prompts.append(f"<|{role}|>") - return "\n".join(prompts) - elif prompt_style.style_name == "XVERSE": - ret = ( - f"<|system|> \n {prompt_style.system_prompt}" - if prompt_style.system_prompt - else "" - ) - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += f"<|{role}|> \n {content}" - else: - ret += f"<|{role}|>" - return ret - elif prompt_style.style_name == "QWEN": - if tools: - tool_desc = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters} Format the arguments as a JSON object.""" - - react_instruction = """Answer the following questions as best you can. You have access to the following APIs: - -{tools_text} - -Use the following format: - -Question: the input question you must answer -Thought: you should always think about what to do -Action: the action to take, should be one of [{tools_name_text}] -Action Input: the input to the action -Observation: the result of the action -... (this Thought/Action/Action Input/Observation can be repeated zero or more times) -Thought: I now know the final answer -Final Answer: the final answer to the original input question - -Begin!""" - tools_text = [] - tools_name_text = [] - for func_info in tools: - parameters = [] - fp = func_info["function"].get("parameters", {}) - if fp: - required_parameters = fp.get("required", []) - for name, p in fp["properties"].items(): - param = dict({"name": name}, **p) - if name in required_parameters: - param["required"] = True - parameters.append(param) - - name = func_info["function"]["name"] - desc = func_info["function"]["description"] - tool_string = tool_desc.format( - name_for_model=name, - name_for_human=name, - # Hint: You can add the following format requirements in description: - # "Format the arguments as a JSON object." - # "Enclose the code within triple backticks (`) at the beginning and end of the code." 
- description_for_model=desc, - parameters=json.dumps(parameters, ensure_ascii=False), - ) - tools_text.append(tool_string) - tools_name_text.append(name) - tools_text_string = "\n\n".join(tools_text) - tools_name_text_string = ", ".join(tools_name_text) - tool_system = react_instruction.format( - tools_text=tools_text_string, - tools_name_text=tools_name_text_string, + return rendered + + def get_full_context( + self, messages: List, chat_template: str, tokenizer=None, **kwargs + ) -> str: + if tokenizer is not None: + try: + full_context = tokenizer.apply_chat_template( + messages, + tokenize=False, + chat_template=chat_template, + add_generation_prompt=True, + **kwargs, ) - else: - tool_system = "" - - ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>" - for message in chat_history: - role = get_role(message["role"]) - content = message.get("content") - - ret += prompt_style.intra_message_sep - if tools: - if role == "user": - if tool_system: - content = tool_system + f"\n\nQuestion: {content}" - tool_system = "" - else: - content = f"Question: {content}" - elif role == "assistant": - tool_calls = message.get("tool_calls") - if tool_calls: - func_call = tool_calls[0]["function"] - f_name, f_args = ( - func_call["name"], - func_call["arguments"], - ) - content = f"Thought: I can use {f_name}.\nAction: {f_name}\nAction Input: {f_args}" - elif content: - content = f"Thought: I now know the final answer.\nFinal answer: {content}" - elif role == "tool": - role = "function" - content = f"Observation: {content}" - else: - raise Exception(f"Unsupported message role: {role}") - if content: - content = content.lstrip("\n").rstrip() - ret += f"<|im_start|>{role}\n{content}<|im_end|>" - else: - ret += f"<|im_start|>{role}\n" - return ret - elif prompt_style.style_name == "CHATML": - ret = ( - "" - if prompt_style.system_prompt == "" - else prompt_style.system_prompt + prompt_style.intra_message_sep + "\n" - ) - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] + return full_context + except Exception as e: + logger.warning( + f"tokenizer.apply_chat_template error. Maybe this is an old model: {e}" + ) + return self._build_from_raw_template(messages, chat_template, **kwargs) + else: + # build from jinja + # Compilation function uses a cache to avoid recompiling the same template + return self._build_from_raw_template(messages, chat_template, **kwargs) - if content: - ret += role + "\n" + content + prompt_style.intra_message_sep + "\n" - else: - ret += role + "\n" - return ret - elif prompt_style.style_name == "INTERNLM2": - ret = ( - "" - if prompt_style.system_prompt == "" - else "<|im_start|>system\n" - + prompt_style.system_prompt - + prompt_style.intra_message_sep - + "\n" - ) - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] + @staticmethod + def get_specific_prompt(model_family: str, messages: List[Dict]): + """ + Inspired by FastChat. Format chat history into a prompt according to the prompty style of + different models. 
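
The jinja-based path added above (_compile_jinja_template, _build_from_raw_template and get_full_context) is what replaces the hand-written prompt styles being deleted in this hunk: when a tokenizer is available its apply_chat_template is tried first, otherwise the family's chat_template string is rendered directly in a sandboxed jinja environment. A minimal sketch of that direct rendering; the ChatML-style template text below is only an example, not a template shipped by xinference:

# Sketch only, assuming jinja2 is installed (it is required by this code path).
from jinja2.sandbox import ImmutableSandboxedEnvironment

example_template = (
    "{% for message in messages %}"
    "<|im_start|>{{ message['role'] }}\n{{ message['content'] }}<|im_end|>\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
)
env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
prompt = env.from_string(example_template).render(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    add_generation_prompt=True,
)
# prompt now ends with "<|im_start|>assistant\n", ready to be passed to generation.
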
+ """ + _messages = [x for x in messages] # copy for not modifying the origin messages + _messages.append({"role": "assistant", "content": ""}) - if content: - ret += role + "\n" + content + prompt_style.intra_message_sep + "\n" - else: - ret += role + "\n" - return ret - elif prompt_style.style_name == "ADD_COLON_SINGLE_COT": - ret = prompt_style.system_prompt + prompt_style.intra_message_sep - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + ": " + content + prompt_style.intra_message_sep - else: - ret += role + ": Let's think step by step." - return ret - elif prompt_style.style_name == "DEEPSEEK_CHAT": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = prompt_style.system_prompt - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + ": " + content + seps[i % 2] - else: - ret += role + ":" - return ret - elif prompt_style.style_name == "DEEPSEEK_CODER": - sep = prompt_style.inter_message_sep - ret = prompt_style.system_prompt + sep - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + "\n" + content + sep - else: - ret += role + "\n" - return ret - elif prompt_style.style_name == "GORILLA_OPENFUNCTIONS": - if tools: - gorilla_functions = [] - for tool in tools: - gorilla_functions.append( - { - "name": tool["function"]["name"], - "api_name": tool["function"]["name"], - "description": tool["function"]["description"], - "parameters": [ - dict({"name": name}, **p) - for name, p in tool["function"]["parameters"][ - "properties" - ].items() - ], - } - ) - tools_string = json.dumps(gorilla_functions) - return f"USER: <> {prompt} <> {tools_string}\nASSISTANT: " - else: - return f"USER: <> {prompt}\nASSISTANT: " - elif prompt_style.style_name == "orion": - ret = "" - for i, message in enumerate(chat_history): - content = message["content"] - role = get_role(message["role"]) - if i % 2 == 0: # Human - assert content is not None - ret += role + ": " + content + "\n\n" - else: # Assistant - if content: - ret += role + ": " + content + "" - else: - ret += role + ": " - return ret - elif prompt_style.style_name == "gemma": - ret = "" - for message in chat_history: - content = message["content"] - role = get_role(message["role"]) - ret += "" + role + "\n" - if content: - ret += content + "\n" - return ret - elif prompt_style.style_name == "CodeShell": - ret = "" - for message in chat_history: - content = message["content"] - role = get_role(message["role"]) - if content: - ret += f"{role}{content}||" - else: - ret += f"{role}".rstrip() - return ret - elif prompt_style.style_name == "MINICPM-2B": - ret = "" - for message in chat_history: - content = message["content"] or "" - role = get_role(message["role"]) - if role == "user": - ret += "<用户>" + content.strip() - else: - ret += "" + content.strip() - return ret - elif prompt_style.style_name == "PHI3": - ret = f"<|system|>{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}" - for message in chat_history: - content = message["content"] or "" - role = get_role(message["role"]) - if content: - ret += f"<|{role}|>{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}" - else: - ret += f"<|{role}|>{prompt_style.intra_message_sep}" - ret += "<|assistant|>\n" - return ret - elif prompt_style.style_name == "c4ai-command-r": 
- ret = ( - f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" - f"{prompt_style.system_prompt}{prompt_style.inter_message_sep}" + if model_family == "internvl2": + system_prompt = ( + messages[0]["content"] if messages[0]["role"] == "system" else "" ) - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += f"{role}{content}{prompt_style.inter_message_sep}" - else: - ret += role - return ret - elif prompt_style.style_name == "mistral-nemo": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = "" - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - if i == len(chat_history) - 2 and prompt_style.system_prompt: - ret += ( - role - + " " - + prompt_style.system_prompt - + "\n\n" - + content - + seps[i % 2] - ) - else: - ret += role + " " + content + seps[i % 2] - else: - ret += role - return ret - elif prompt_style.style_name == "INTERNVL": + intra_message_sep = "<|im_end|>" ret = ( "" - if prompt_style.system_prompt == "" + if system_prompt == "" else "<|im_start|>system\n" - + prompt_style.system_prompt - + prompt_style.intra_message_sep + + system_prompt + + intra_message_sep + "\n" ) images = [] # type: ignore - for message in chat_history: - role = get_role(message["role"]) + for message in _messages: + role = "<|im_start|>" + message["role"] content = message["content"] if isinstance(content, str): if content: - ret += ( - role - + "\n" - + content - + prompt_style.intra_message_sep - + "\n" - ) + ret += role + "\n" + content + intra_message_sep + "\n" else: ret += role + "\n" elif isinstance(content, list): @@ -488,21 +162,15 @@ def get_role(role_name: str): image_futures.append(fut) images = [fut.result() for fut in image_futures] if len(image_futures) == 0: - ret += ( - role + "\n" + text + prompt_style.intra_message_sep + "\n" - ) + ret += role + "\n" + text + intra_message_sep + "\n" else: ret += ( - role - + "\n" - + f"\n{text}" - + prompt_style.intra_message_sep - + "\n" + role + "\n" + f"\n{text}" + intra_message_sep + "\n" ) - return (ret, images) + return ret, images else: - raise ValueError(f"Invalid prompt style: {prompt_style.style_name}") + raise ValueError(f"Invalid model family: {model_family}") @classmethod def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk: @@ -523,7 +191,7 @@ def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChun { "index": i, "delta": { - "content": choice.get("text"), + **({"content": choice["text"]} if "text" in choice else {}), **( {"tool_calls": choice["tool_calls"]} if "tool_calls" in choice @@ -632,83 +300,39 @@ def _to_chat_completion(completion: Completion) -> ChatCompletion: } @staticmethod - def _eval_gorilla_openfunctions_arguments(c, tools): - tool_names = [tool["function"]["name"] for tool in tools] - arguments = c["choices"][0]["text"] - - def tool_call(n, **kwargs): - return None, n, kwargs - + def _eval_glm_chat_arguments(c): try: - a, b, c = eval( - arguments, {n: functools.partial(tool_call, n) for n in tool_names} - ) - return a, b, c - except Exception as e: - logger.error("Eval tool calls completion failed: %s", e) - return arguments, None, None - - @staticmethod - def _eval_glm_chat_arguments(c, tools): - try: - if isinstance(c[0], str): - return c[0], None, None - return None, c[0]["name"], c[0]["parameters"] + if isinstance(c, dict): + return None, c["name"], c["arguments"] except KeyError: 
logger.error("Can't parse glm output: %s", c) return str(c), None, None + else: + return str(c), None, None @staticmethod - def _eval_qwen_chat_arguments(c, tools): + def _eval_qwen_chat_arguments(c): text = c["choices"][0]["text"] + text: str = text.strip() + if text.startswith(QWEN_TOOL_CALL_SYMBOLS[0]): + text = text[len(QWEN_TOOL_CALL_SYMBOLS[0]) :] + if text.endswith(QWEN_TOOL_CALL_SYMBOLS[1]): + text = text[: -len(QWEN_TOOL_CALL_SYMBOLS[1])] + text = text.strip() try: - # Refer to: - # https://github.com/QwenLM/Qwen/blob/main/examples/react_prompt.md - # https://github.com/QwenLM/Qwen/blob/main/openai_api.py#L297 - func_name, func_args, content = "", "", "" - i = text.rfind("\nAction:") - j = text.rfind("\nAction Input:") - k = text.rfind("\nObservation:") - t = max( - text.rfind("\nThought:", 0, i), text.rfind("Thought:", 0, i) - ) # find the last thought just before Action, considering the Thought at the very beginning - if 0 <= i < j: # If the text has `Action` and `Action input`, - if k < j: # but does not contain `Observation`, - # then it is likely that `Observation` is omitted by the LLM, - # because the output text may have discarded the stop word. - text = text.rstrip() + "\nObservation:" # Add it back. - k = text.rfind("\nObservation:") - if 0 <= t < i < j < k: - func_name = text[i + len("\nAction:") : j].strip() - func_args = text[j + len("\nAction Input:") : k].strip() - content = text[ - t + len("\nThought:") : i - ].strip() # len("\nThought:") and len("Thought:") both are OK since there is a space after : - if func_name: - return content, func_name, json.loads(func_args) + content = json.loads(text) + return None, content["name"], content["arguments"] except Exception as e: - logger.error("Eval tool calls completion failed: %s", e) - t = max(text.rfind("\nThought:"), text.rfind("Thought:")) - z = max(text.rfind("\nFinal Answer:"), text.rfind("Final Answer:")) - if z >= 0: - text = text[ - z + len("\nFinal Answer:") : - ] # len("\nFinal Answer::") and len("Final Answer::") both are OK since there is a space after : - else: - text = text[ - t + len("\nThought:") : - ] # There is only Thought: no Final Answer: - return text, None, None + logger.error("Can't parse qwen tool call output: %s. Error: %s", text, e) + return text, None, None @classmethod - def _eval_tool_arguments(cls, model_family, c, tools): + def _eval_tool_arguments(cls, model_family, c): family = model_family.model_family or model_family.model_name - if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]: - content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools) - elif family in GLM4_TOOL_CALL_FAMILY: - content, func, args = cls._eval_glm_chat_arguments(c, tools) + if family in GLM4_TOOL_CALL_FAMILY: + content, func, args = cls._eval_glm_chat_arguments(c) elif family in QWEN_TOOL_CALL_FAMILY: - content, func, args = cls._eval_qwen_chat_arguments(c, tools) + content, func, args = cls._eval_qwen_chat_arguments(c) else: raise Exception( f"Model {model_family.model_name} is not support tool calls." 
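
The new _eval_qwen_chat_arguments above no longer walks the old ReAct transcript; it strips the markers stored in QWEN_TOOL_CALL_SYMBOLS and parses the remainder as a JSON object with "name" and "arguments". A standalone sketch of that parsing; the marker strings below are assumptions (qwen2-style tool-call tags), not values taken from this diff:

# Sketch only; the marker constants are assumed stand-ins for QWEN_TOOL_CALL_SYMBOLS.
import json

TOOL_CALL_START = "<tool_call>"
TOOL_CALL_END = "</tool_call>"

def parse_qwen_tool_call(text):
    text = text.strip()
    if text.startswith(TOOL_CALL_START):
        text = text[len(TOOL_CALL_START):]
    if text.endswith(TOOL_CALL_END):
        text = text[: -len(TOOL_CALL_END)]
    try:
        payload = json.loads(text.strip())
        return None, payload["name"], payload["arguments"]  # tool call
    except Exception:
        return text, None, None  # plain assistant content, no tool call

parse_qwen_tool_call('<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>')
# -> (None, "get_weather", {"city": "Paris"})
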
@@ -747,9 +371,9 @@ def process_tokens(tokens: str, delta: str): return lambda tokens, delta: delta @classmethod - def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools): + def _tool_calls_completion_chunk(cls, model_family, model_uid, c): _id = str(uuid.uuid4()) - content, func, args = cls._eval_tool_arguments(model_family, c, tools) + content, func, args = cls._eval_tool_arguments(model_family, c) if func: d = { "role": "assistant", @@ -760,7 +384,7 @@ def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools): "type": "function", "function": { "name": func, - "arguments": json.dumps(args), + "arguments": json.dumps(args, ensure_ascii=False), }, } ], @@ -795,9 +419,9 @@ def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools): } @classmethod - def _tool_calls_completion(cls, model_family, model_uid, c, tools): + def _tool_calls_completion(cls, model_family, model_uid, c): _id = str(uuid.uuid4()) - content, func, args = cls._eval_tool_arguments(model_family, c, tools) + content, func, args = cls._eval_tool_arguments(model_family, c) if func: m = { "role": "assistant", @@ -808,7 +432,7 @@ def _tool_calls_completion(cls, model_family, model_uid, c, tools): "type": "function", "function": { "name": func, - "arguments": json.dumps(args), + "arguments": json.dumps(args, ensure_ascii=False), }, } ], @@ -841,16 +465,6 @@ def _tool_calls_completion(cls, model_family, model_uid, c, tools): "usage": usage, } - @classmethod - def get_full_prompt(cls, model_family, prompt, system_prompt, chat_history, tools): - assert model_family.prompt_style is not None - prompt_style = model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] - full_prompt = cls.get_prompt(prompt, chat_history, prompt_style, tools=tools) - return full_prompt - def get_file_location( llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str @@ -903,3 +517,94 @@ def _decode_image(_url): return Image.open(_url).convert("RGB") else: return Image.open(BytesIO(response.content)).convert("RGB") + + +@typing.no_type_check +def generate_completion_chunk( + chunk_text: Optional[str], + finish_reason: Optional[str], + chunk_id: str, + model_uid: str, + prompt_tokens: int, + completion_tokens: int, + total_tokens: int, + has_choice: bool = True, + has_content: bool = True, +): + choices = [] + if has_choice: + choices.append( + CompletionChoice( + text=chunk_text, index=0, logprobs=None, finish_reason=finish_reason + ) + if has_content + else CompletionChoice(index=0, logprobs=None, finish_reason=finish_reason) + ) + return CompletionChunk( + id=chunk_id, + object="text_completion", + created=int(time.time()), + model=model_uid, + choices=choices, + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ), + ) + + +def generate_chat_completion( + model_uid: str, + response: str, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + finish_reason="stop", +) -> ChatCompletion: + return ChatCompletion( + id="chat" + str(uuid.uuid1()), + object="chat.completion", + created=int(time.time()), + model=model_uid, + choices=[ + ChatCompletionChoice( + index=0, + message={"role": "assistant", "content": response}, + finish_reason=finish_reason, + ) + ], + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ), + ) + + +@functools.lru_cache +def 
get_stop_token_ids_from_config_file(model_path: str) -> Optional[List[int]]: + from transformers import GenerationConfig as TransformersGenerationConfig + + transformers_config = TransformersGenerationConfig.from_pretrained(model_path) + if transformers_config.eos_token_id is not None: + stop_token_ids = ( + transformers_config.eos_token_id + if isinstance(transformers_config.eos_token_id, list) + else [transformers_config.eos_token_id] + ) + return stop_token_ids + return None + + +def parse_messages(messages: List[Dict]) -> Tuple: + """ + Some older models still follow the old way of parameter passing. + This function helps to parse out the needed information from OpenAI-compatible `messages`. + """ + system_messages = [mess["content"] for mess in messages if mess["role"] == "system"] + content_messages = [mess for mess in messages if mess["role"] != "system"] + prompt = content_messages[-1]["content"] + system_prompt = ". ".join(system_messages) if system_messages else None + chat_history = content_messages[:-1] + return prompt, system_prompt, chat_history diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 372efc7a3e..e97df3b8a1 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -13,7 +13,6 @@ # limitations under the License. import asyncio -import json import logging import multiprocessing import os @@ -24,9 +23,9 @@ Any, AsyncGenerator, Dict, - Iterable, List, Optional, + Tuple, TypedDict, Union, ) @@ -34,18 +33,20 @@ from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, CompletionUsage, LoRA, - ToolCallFunction, - ToolCalls, ) from .. import LLM, LLMFamilyV1, LLMSpecV1 from ..llm_family import CustomLLMFamilyV1 -from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin +from ..utils import ( + QWEN_TOOL_CALL_FAMILY, + QWEN_TOOL_CALL_SYMBOLS, + ChatModelMixin, + generate_completion_chunk, +) logger = logging.getLogger(__name__) @@ -363,23 +364,28 @@ def match( @staticmethod def _convert_request_output_to_completion_chunk( request_id: str, model: str, request_output: "RequestOutput" - ) -> CompletionChunk: + ) -> Tuple[CompletionChunk, Optional[str]]: choices: List[CompletionChoice] = [] + finish_reason = None for output in request_output.outputs: choices.append( CompletionChoice( text=output.text, index=output.index, logprobs=None, # TODO: support logprobs. 
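
The parse_messages helper above is what the code paths that still need the legacy triple (for example the yi_vl model earlier in this patch) use to recover (prompt, system_prompt, chat_history) from OpenAI-style messages. A small sketch of what it returns for a typical conversation:

# Sketch only.
from xinference.model.llm.utils import parse_messages

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "What is the capital of France?"},
]
prompt, system_prompt, chat_history = parse_messages(messages)
# prompt        == "What is the capital of France?"
# system_prompt == "You are a helpful assistant."
# chat_history  == the middle user/assistant turns, in order
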
- finish_reason=output.finish_reason, + finish_reason=None, ) ) - return CompletionChunk( - id=request_id, - object="text_completion", - created=int(time.time()), - model=model, - choices=choices, + finish_reason = output.finish_reason + return ( + CompletionChunk( + id=request_id, + object="text_completion", + created=int(time.time()), + model=model, + choices=choices, + ), + finish_reason, ) @staticmethod @@ -463,10 +469,14 @@ async def async_generate( async def stream_results() -> AsyncGenerator[CompletionChunk, None]: previous_texts = [""] * sanitized_generate_config["n"] - tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family) prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 + complete_response = "" + match_tool_call_tmp_results = [] + is_match_tool_call = False + chunk = None + finish_reason = None async for _request_output in results_generator: - chunk = self._convert_request_output_to_completion_chunk( + chunk, finish_reason = self._convert_request_output_to_completion_chunk( request_id=request_id, model=self.model_uid, request_output=_request_output, @@ -476,40 +486,8 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: delta = choice["text"][len(previous_texts[i]) :] previous_texts[i] = choice["text"] choice["text"] = delta + complete_response += delta - if tools: - # only handle the first choice - choice = chunk["choices"][0] - if choice["finish_reason"] is not None: - # use previous text for evaluation temporarily - choice_delta = choice["text"] - choice["text"] = previous_texts[0] - _content, func, args = ChatModelMixin._eval_tool_arguments( - self.model_family, chunk, tools - ) - choice["text"] = tools_token_filter( - tokens=previous_texts[0], delta=choice_delta - ) - if func is not None: - choice["text"] = None - choice["finish_reason"] = "tool_calls" - choice["tool_calls"] = [ - ToolCalls( - id=str(uuid.uuid4()), - type="function", - function=ToolCallFunction( - name=func, - arguments=json.dumps(args, ensure_ascii=False), - ), - ) - ] - else: - # use a filter function to skip Qwen's react thought process - choice["text"] = tools_token_filter( - tokens=previous_texts[0], delta=choice["text"] - ) - if not choice["text"]: - continue prompt_tokens = len(_request_output.prompt_token_ids) completion_tokens = sum( len(output.token_ids) for output in _request_output.outputs @@ -520,7 +498,61 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: completion_tokens=completion_tokens, total_tokens=total_tokens, ) + + if tools: + """ + The qwen2 tool call returns format like this: + + {...} + + Here is to match this. 
+ """ + if (len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)) and ( + not QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response) + ): + for c in match_tool_call_tmp_results: + yield c + match_tool_call_tmp_results.clear() + yield chunk + elif (len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)) and ( + QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response) + ): + match_tool_call_tmp_results.append(chunk) + else: + assert len(QWEN_TOOL_CALL_SYMBOLS[0]) <= len(complete_response) + if not is_match_tool_call and complete_response.startswith( + QWEN_TOOL_CALL_SYMBOLS[0] + ): + is_match_tool_call = True + match_tool_call_tmp_results.clear() + + if not is_match_tool_call: + for c in match_tool_call_tmp_results: + yield c + match_tool_call_tmp_results.clear() + yield chunk + else: + chunk["choices"][0]["text"] = complete_response + else: + yield chunk + + if is_match_tool_call: + assert chunk is not None yield chunk + + # match OpenAI API stream + yield generate_completion_chunk( + chunk_text=None, + finish_reason=finish_reason, + chunk_id=request_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + if include_usage: chunk = CompletionChunk( id=request_id, @@ -586,59 +618,68 @@ def _sanitize_chat_config( ) -> Dict: if not generate_config: generate_config = {} - if self.model_family.prompt_style: - if ( - not generate_config.get("stop") - ) and self.model_family.prompt_style.stop: - generate_config["stop"] = self.model_family.prompt_style.stop.copy() - if self.model_family.prompt_style.stop_token_ids: - generate_config.setdefault( - "stop_token_ids", - self.model_family.prompt_style.stop_token_ids.copy(), - ) + if not generate_config.get("stop") and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() + if ( + not generate_config.get("stop_token_ids") + and self.model_family.stop_token_ids + ): + generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy() return generate_config + @staticmethod + def is_tool_call_chunk(chunk): + return chunk["choices"][0]["text"].startswith(QWEN_TOOL_CALL_SYMBOLS[0]) + + async def _async_to_tool_completion_chunks( + self, + chunks: AsyncGenerator[CompletionChunk, None], + ) -> AsyncGenerator[ChatCompletionChunk, None]: + i = 0 + async for chunk in chunks: + if i == 0: + yield self._get_first_chat_completion_chunk(chunk) + # usage + choices = chunk.get("choices") + if not choices: + yield self._get_final_chat_completion_chunk(chunk) + else: + if self.is_tool_call_chunk(chunk): + yield self._tool_calls_completion_chunk( + self.model_family, self.model_uid, chunk + ) + else: + yield self._to_chat_completion_chunk(chunk) + i += 1 + async def async_chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] tools = generate_config.pop("tools", []) if generate_config else None - full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools) - - generate_config = self._sanitize_chat_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. 
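
The branchy streaming logic above is a small prefix-buffering scheme: while the accumulated output could still be a prefix of the tool-call start marker, chunks are held back; once the ambiguity is resolved they are either flushed to the caller (ordinary text) or discarded and replaced by a single tool-call chunk at the end of the stream. A condensed sketch of the same idea; the marker value is an assumption standing in for QWEN_TOOL_CALL_SYMBOLS[0]:

# Sketch only.
def buffer_until_decided(deltas, marker="<tool_call>"):
    buffered, emitted, accumulated = [], [], ""
    is_tool_call = False
    for delta in deltas:
        accumulated += delta
        if len(marker) > len(accumulated):
            if marker.startswith(accumulated):
                buffered.append(delta)  # still ambiguous: hold back
            else:
                emitted.extend(buffered + [delta])  # definitely not a tool call
                buffered.clear()
        else:
            if not is_tool_call and accumulated.startswith(marker):
                is_tool_call = True  # swallow the buffered marker chunks
                buffered.clear()
            if not is_tool_call:
                emitted.extend(buffered + [delta])
                buffered.clear()
    return is_tool_call, emitted

buffer_until_decided(["<tool", "_call>", '{"name": "f", "arguments": {}}'])
# -> (True, [])
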
model_family = self.model_family.model_family or self.model_family.model_name + full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = list(stop) + ["Observation:"] - else: - generate_config["stop"] = "Observation:" + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, **full_context_kwargs + ) + generate_config = self._sanitize_chat_config(generate_config) stream = generate_config.get("stream", None) if stream: agen = await self.async_generate(full_prompt, generate_config, tools) assert isinstance(agen, AsyncGenerator) + if tools: + return self._async_to_tool_completion_chunks(agen) return self._async_to_chat_completion_chunks(agen) else: c = await self.async_generate(full_prompt, generate_config) assert not isinstance(c, AsyncGenerator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) @@ -666,28 +707,28 @@ def _sanitize_chat_config( self, generate_config: Optional[Dict] = None, ) -> Dict: + from ..utils import get_stop_token_ids_from_config_file + if not generate_config: generate_config = {} - if self.model_family.prompt_style: - if self.model_family.prompt_style.stop_token_ids: + if generate_config.get("stop_token_ids", None) is None: + stop_token_ids = get_stop_token_ids_from_config_file(self.model_path) + if stop_token_ids is not None: + generate_config.setdefault("stop_token_ids", stop_token_ids) + else: generate_config.setdefault( - "stop_token_ids", - self.model_family.prompt_style.stop_token_ids.copy(), + "stop_token_ids", self.model_family.stop_token_ids.copy() ) return generate_config async def async_chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: # only support single image, waiting vllm support multi images - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - chat_history = chat_history or [] - prompt, images = self.get_prompt(prompt, chat_history, prompt_style) + model_family = self.model_family.model_family or self.model_family.model_name + prompt, images = self.get_specific_prompt(model_family, messages) if len(images) == 0: inputs = { diff --git a/xinference/types.py b/xinference/types.py index 3f636d94c3..fee7c54948 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -39,8 +39,6 @@ top_p_field, ) -SPECIAL_TOOL_PROMPT = "" - class Image(TypedDict): url: Optional[str] @@ -142,7 +140,7 @@ class ToolCalls(TypedDict): class CompletionChoice(TypedDict): - text: str + text: NotRequired[str] index: int logprobs: Optional[CompletionLogprobs] finish_reason: Optional[str] From 7ff5951f20599bc49c72a6a2f807a57b33b1ed83 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 17:36:00 +0800 Subject: [PATCH 02/15] fix mypy --- xinference/core/supervisor.py | 2 +- xinference/model/image/stable_diffusion/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 1 + xinference/model/llm/mlx/core.py | 1 + xinference/model/llm/sglang/core.py | 1 + 
xinference/model/llm/transformers/core.py | 8 +++++++- xinference/model/llm/vllm/core.py | 8 +++++--- 7 files changed, 17 insertions(+), 6 deletions(-) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 61fc4caa8e..1a522333d8 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -1027,7 +1027,7 @@ async def _launch_model(): else: task = asyncio.create_task(_launch_model()) ASYNC_LAUNCH_TASKS[model_uid] = task - task.add_done_callback(lambda _: callback_for_async_launch(model_uid)) + task.add_done_callback(lambda _: callback_for_async_launch(model_uid)) # type: ignore return model_uid async def get_instance_info( diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index b00ee2de46..041774843e 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -198,7 +198,7 @@ def _gen_base64_image(_img): with ThreadPoolExecutor() as executor: results = list(map(partial(executor.submit, _gen_base64_image), images)) # type: ignore - image_list = [Image(url=None, b64_json=s.result()) for s in results] + image_list = [Image(url=None, b64_json=s.result()) for s in results] # type: ignore return ImageList(created=int(time.time()), data=image_list) else: raise ValueError(f"Unsupported response format: {response_format}") diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 30a835ff7c..28b8df2402 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -277,6 +277,7 @@ def chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, **full_context_kwargs ) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 07966fcbba..fd82c03798 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -379,6 +379,7 @@ def chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, **full_context_kwargs ) diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 7d2566ee27..8e36cd193f 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -442,6 +442,7 @@ async def async_chat( messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: + assert self.model_family.chat_template is not None full_prompt = self.get_full_context(messages, self.model_family.chat_template) generate_config = self._sanitize_chat_config(generate_config) diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index fd7d75b22e..a6e5a14d0f 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -618,7 +618,11 @@ def get_builtin_stop_token_ids(self) -> Tuple: if stop_token_ids is not None: return tuple(stop_token_ids) else: - return tuple(self.model_family.stop_token_ids) + return ( + tuple(self.model_family.stop_token_ids) + if self.model_family.stop_token_ids + else tuple() + ) def 
handle_batch_inference_results(self, req_list: List[InferenceRequest]): for req in req_list: @@ -724,6 +728,7 @@ def chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, @@ -749,6 +754,7 @@ def load(self): super().load() def _get_full_prompt(self, messages: List[Dict], tools): + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, tokenizer=self._tokenizer ) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index e97df3b8a1..3b4d77f293 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -662,6 +662,7 @@ async def async_chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, **full_context_kwargs ) @@ -716,9 +717,10 @@ def _sanitize_chat_config( if stop_token_ids is not None: generate_config.setdefault("stop_token_ids", stop_token_ids) else: - generate_config.setdefault( - "stop_token_ids", self.model_family.stop_token_ids.copy() - ) + if self.model_family.stop_token_ids: + generate_config.setdefault( + "stop_token_ids", self.model_family.stop_token_ids.copy() + ) return generate_config async def async_chat( From 3c43cffcdfadc0a44113981643417adfe62e9784 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 18:47:25 +0800 Subject: [PATCH 03/15] fix UT --- xinference/client/tests/test_client.py | 69 +++++++++++++++------- xinference/model/llm/llama_cpp/core.py | 7 +-- xinference/model/llm/mlx/core.py | 6 +- xinference/model/llm/transformers/utils.py | 6 +- xinference/model/llm/utils.py | 6 +- 5 files changed, 59 insertions(+), 35 deletions(-) diff --git a/xinference/client/tests/test_client.py b/xinference/client/tests/test_client.py index e6bd554129..fd785b034c 100644 --- a/xinference/client/tests/test_client.py +++ b/xinference/client/tests/test_client.py @@ -83,9 +83,14 @@ def _check_stream(): generate_config={"stream": True, "max_tokens": 5}, ) for chunk in streaming_response: - assert ("content" in chunk["choices"][0]["delta"]) or ( - "role" in chunk["choices"][0]["delta"] - ) + assert "finish_reason" in chunk["choices"][0] + finish_reason = chunk["choices"][0]["finish_reason"] + if finish_reason is None: + assert ("content" in chunk["choices"][0]["delta"]) or ( + "role" in chunk["choices"][0]["delta"] + ) + else: + assert chunk["choices"][0]["delta"] == {} _check_stream() @@ -218,7 +223,6 @@ def test_RESTful_client_custom_model(setup): "en", "zh" ], "model_ability": [ - "embed", "chat" ], "model_family": "other", @@ -234,15 +238,9 @@ def test_RESTful_client_custom_model(setup): "model_id": "ziqingyang/chinese-alpaca-2-7b" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\\n\\n### " - } + "chat_template": "xyz", + "stop_token_ids": [], + "stop": [] }""" client.register_model(model_type="LLM", model=model, persist=False) @@ -266,7 +264,7 @@ def test_RESTful_client_custom_model(setup): custom_model_reg = model_reg assert custom_model_reg is None - # test register with string prompt style name + # test register with chat_template using model_family model_with_prompt = """{ "version": 1, "context_length":2048, @@ -291,12 +289,12 @@ def test_RESTful_client_custom_model(setup): "model_id": "ziqingyang/chinese-alpaca-2-7b" } ], - "prompt_style": "qwen-chat" + "chat_template": "qwen-chat" }""" client.register_model(model_type="LLM", model=model_with_prompt, persist=False) client.unregister_model(model_type="LLM", model_name="custom_model") - model_with_prompt2 = """{ + model_with_vision = """{ "version": 1, "context_length":2048, "model_name": "custom_model", @@ -304,8 +302,8 @@ def test_RESTful_client_custom_model(setup): "en", "zh" ], "model_ability": [ - "embed", - "chat" + "chat", + "vision" ], "model_family": "other", "model_specs": [ @@ -320,10 +318,41 @@ def test_RESTful_client_custom_model(setup): "model_id": "ziqingyang/chinese-alpaca-2-7b" } ], - "prompt_style": "xyz123" + "chat_template": "xyz123" }""" with pytest.raises(RuntimeError): - client.register_model(model_type="LLM", model=model_with_prompt2, persist=False) + client.register_model(model_type="LLM", model=model_with_vision, persist=False) + + model_with_tool_call = """{ + "version": 1, + "context_length":2048, + "model_name": "custom_model", + "model_lang": [ + "en", "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_family": "other", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "ziqingyang/chinese-alpaca-2-7b" + } + ], + "chat_template": "xyz123" + }""" + with pytest.raises(RuntimeError): + client.register_model( + model_type="LLM", model=model_with_tool_call, persist=False + ) def test_client_from_modelscope(setup): diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 28b8df2402..8e4929cbfe 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -182,13 +182,10 @@ def generator_wrapper( ): _completion_chunk["model"] = self.model_uid request_id = _completion_chunk["id"] - choice = _completion_chunk["choices"][0] - if choice["finish_reason"] is not None: - completion_tokens = index - choice.pop("text", None) + completion_tokens = index + 1 total_tokens = prompt_tokens + completion_tokens _completion_chunk["usage"] = CompletionUsage( - prompt_tokens=total_tokens, + prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index fd82c03798..23bb19ed11 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -245,15 +245,13 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): ) if stream: yield generate_completion_chunk( - None, + "", finish_reason=finish_reason, chunk_id=chunk_id, model_uid=model_uid, prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), - has_choice=True, - has_content=False, ), completion_usage else: yield generate_completion_chunk( @@ -264,8 
+262,6 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), - has_choice=True, - has_content=True, ), completion_usage if include_usage: diff --git a/xinference/model/llm/transformers/utils.py b/xinference/model/llm/transformers/utils.py index d34112d24f..ed07f1b65e 100644 --- a/xinference/model/llm/transformers/utils.py +++ b/xinference/model/llm/transformers/utils.py @@ -321,7 +321,7 @@ def generate_stream( if stream: completion_choice = CompletionChoice( - index=0, logprobs=None, finish_reason=finish_reason + text=output, index=0, logprobs=None, finish_reason=finish_reason ) else: completion_choice = CompletionChoice( @@ -692,15 +692,13 @@ def _batch_inference_one_step_internal( if r.stopped: # OpenAI compatible chunk completion_chunk = generate_completion_chunk( - chunk_text=None, + chunk_text="", finish_reason=r.finish_reason, chunk_id=r.chunk_id, model_uid=model_uid, prompt_tokens=len(r.prompt_tokens), completion_tokens=len(r.new_tokens), total_tokens=len(r.prompt_tokens) + len(r.new_tokens), - has_choice=True, - has_content=False, ) r.completion.append(completion_chunk) r.completion.append(eos_flag) diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 974671720e..5b9c5fc70e 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -191,7 +191,11 @@ def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChun { "index": i, "delta": { - **({"content": choice["text"]} if "text" in choice else {}), + **( + {"content": choice["text"]} + if ("text" in choice and choice["finish_reason"] is None) + else {} + ), **( {"tool_calls": choice["tool_calls"]} if "tool_calls" in choice From dfe1dffb33e5e7e80663190feaf880e2cb807823 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 18:56:38 +0800 Subject: [PATCH 04/15] fix --- xinference/model/llm/mlx/core.py | 19 +++++-------------- xinference/model/llm/sglang/core.py | 4 +--- xinference/model/llm/vllm/core.py | 4 +--- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 23bb19ed11..7553cd64b4 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -24,7 +24,6 @@ ChatCompletion, ChatCompletionChunk, Completion, - CompletionChoice, CompletionChunk, CompletionUsage, LoRA, @@ -211,24 +210,16 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): else: output += out - completion_choice = CompletionChoice( - text=output, index=0, logprobs=None, finish_reason=None - ) - completion_chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=output, + finish_reason=None, + chunk_id=chunk_id, + model_uid=model_uid, prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), ) - yield completion_chunk, completion_usage - logger.info( f"Average generation speed: {i / (time.time() - start):.2f} tokens/s." 
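
With the _to_chat_completion_chunk change above, a streamed chat delta carries "content" only while finish_reason is None; the terminating chunk arrives with an empty delta, and a trailing usage-only chunk (empty choices) may follow when usage reporting is enabled. A client-side sketch that mirrors the updated test_client check; the model argument stands for a chat model handle obtained from the xinference client:

# Sketch only.
def stream_chat_reply(model, messages):
    reply = ""
    for chunk in model.chat(messages, generate_config={"stream": True, "max_tokens": 64}):
        choices = chunk.get("choices")
        if not choices:  # usage-only chunk, if include_usage was requested
            continue
        choice = choices[0]
        if choice["finish_reason"] is None:
            reply += choice["delta"].get("content", "")
        else:
            assert choice["delta"] == {}
    return reply
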
) diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 8e36cd193f..b2b830d23c 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -373,15 +373,13 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: else finish_reason ) yield generate_completion_chunk( - None, + "", finish_reason=finish_reason, chunk_id=request_id, model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, - has_choice=True, - has_content=False, ) if include_usage: diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 3b4d77f293..3a142c7de7 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -542,15 +542,13 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: # match OpenAI API stream yield generate_completion_chunk( - chunk_text=None, + chunk_text="", finish_reason=finish_reason, chunk_id=request_id, model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, - has_choice=True, - has_content=False, ) if include_usage: From ecdb930f95908b621d3632f77df471ec8c7624fc Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 11:03:26 +0800 Subject: [PATCH 05/15] fix mlx UT --- xinference/model/llm/mlx/core.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 7553cd64b4..d01324fbf5 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -210,6 +210,12 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): else: output += out + completion_usage = CompletionUsage( + prompt_tokens=input_echo_len, + completion_tokens=i, + total_tokens=(input_echo_len + i), + ) + yield generate_completion_chunk( chunk_text=output, finish_reason=None, @@ -218,7 +224,7 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), - ) + ), completion_usage logger.info( f"Average generation speed: {i / (time.time() - start):.2f} tokens/s." 
From 0b01c31d7fea925f4ecd8be9c416351e87d2ee5f Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 11:16:45 +0800 Subject: [PATCH 06/15] fix UT --- xinference/client/tests/test_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/client/tests/test_client.py b/xinference/client/tests/test_client.py index fd785b034c..62f91a87ec 100644 --- a/xinference/client/tests/test_client.py +++ b/xinference/client/tests/test_client.py @@ -73,7 +73,7 @@ def test_RESTful_client(setup): with pytest.raises(RuntimeError): completion = model.chat({"max_tokens": 64}) - messages = {"role": "user", "content": "What is the capital of France?"} + messages = [{"role": "user", "content": "What is the capital of France?"}] completion = model.chat(messages) assert "content" in completion["choices"][0]["message"] From a8adee41f58ab5019de81dfc2f1dc35ff83062a6 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 15:20:05 +0800 Subject: [PATCH 07/15] fix continous batching streaming format issue when chat model calls generate interface --- xinference/core/model.py | 12 ++++++++---- xinference/core/scheduler.py | 22 ++++++++++++++-------- xinference/model/llm/transformers/core.py | 14 ++++++++++++++ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/xinference/core/model.py b/xinference/core/model.py index 10ab759fe6..cec574ab6d 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -439,7 +439,9 @@ async def _call_wrapper(self, output_type: str, fn: Callable, *args, **kwargs): @log_async(logger=logger) async def generate(self, prompt: str, *args, **kwargs): if self.allow_batching(): - return await self.handle_batching_request(prompt, *args, **kwargs) + return await self.handle_batching_request( + prompt, "generate", *args, **kwargs + ) else: kwargs.pop("raw_params", None) if hasattr(self._model, "generate"): @@ -484,7 +486,7 @@ def _get_stream_from_args(*args) -> bool: return False if args[0] is None else args[0].get("stream", False) async def handle_batching_request( - self, prompt_or_messages: Union[str, List[Dict]], *args, **kwargs + self, prompt_or_messages: Union[str, List[Dict]], call_ability, *args, **kwargs ): """ The input parameter `prompt_or_messages`: @@ -498,7 +500,7 @@ async def handle_batching_request( queue: Queue[Any] = Queue() ret = self._queue_consumer(queue) await self._scheduler_ref.add_request( - prompt_or_messages, queue, *args, **kwargs + prompt_or_messages, queue, call_ability, *args, **kwargs ) gen = self._to_async_gen("json", ret) self._current_generator = weakref.ref(gen) @@ -527,7 +529,9 @@ async def chat(self, messages: List[Dict], *args, **kwargs): response = None try: if self.allow_batching(): - return await self.handle_batching_request(messages, *args, **kwargs) + return await self.handle_batching_request( + messages, "chat", *args, **kwargs + ) else: kwargs.pop("raw_params", None) if hasattr(self._model, "chat"): diff --git a/xinference/core/scheduler.py b/xinference/core/scheduler.py index 842b8bd737..6f4af5bfc9 100644 --- a/xinference/core/scheduler.py +++ b/xinference/core/scheduler.py @@ -38,7 +38,13 @@ class AbortRequestMessage(Enum): class InferenceRequest: def __init__( - self, prompt_or_messages, future_or_queue, is_prefill, *args, **kwargs + self, + prompt_or_messages, + future_or_queue, + is_prefill, + call_ability, + *args, + **kwargs, ): # original prompt, prompt(str) for generate model and messages(List[Dict]) for chat model self._prompt = prompt_or_messages @@ -46,6 +52,9 @@ def __init__( 
self._full_prompt = None # whether the current request is in the prefill phase self._is_prefill = is_prefill + # the ability that the user calls this model for, that is `generate` / `chat` for now, + # which is for results formatting + self._call_ability = call_ability # full prompt tokens self._prompt_tokens = None # all new generated tokens during decode phase @@ -104,12 +113,8 @@ def prompt(self): return self._prompt @property - def system_prompt(self): - return self._inference_args[0] - - @property - def chat_history(self): - return self._inference_args[1] + def call_ability(self): + return self._call_ability @property def full_prompt(self): @@ -413,11 +418,12 @@ async def add_request( self, prompt_or_messages: Union[str, List[Dict]], future_or_queue, + call_ability, *args, **kwargs, ): req = InferenceRequest( - prompt_or_messages, future_or_queue, True, *args, **kwargs + prompt_or_messages, future_or_queue, True, call_ability, *args, **kwargs ) rid = req.request_id if rid is not None: diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index a6e5a14d0f..05feced99d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -774,6 +774,20 @@ def prepare_batch_inference(self, req_list: List[InferenceRequest]): def handle_batch_inference_results(self, req_list: List[InferenceRequest]): for req in req_list: if req.error_msg is None and req.completion: + # The `generate` function can be called for some chat models. + # So that we cannot convert completion chunk to chat completion chunk. + if req.call_ability == "generate": + results = [] + for c in req.completion: + if c == "": + continue + elif c == "": + break + else: + results.append(c) + req.completion = results + continue + if req.stream: results = [] for i, c in enumerate(req.completion): From 155715d4491c19bc1730ca8358dd0782e9fbdb3b Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 16:05:53 +0800 Subject: [PATCH 08/15] fix UT --- xinference/core/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/core/model.py b/xinference/core/model.py index cec574ab6d..c8e73c3060 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -511,7 +511,7 @@ async def handle_batching_request( assert self._loop is not None future = ConcurrentFuture() await self._scheduler_ref.add_request( - prompt_or_messages, future, *args, **kwargs + prompt_or_messages, future, call_ability, *args, **kwargs ) fut = asyncio.wrap_future(future, loop=self._loop) result = await fut From 86a32bb34d971b1c7b57d3496c6214b0abf164c3 Mon Sep 17 00:00:00 2001 From: yiboyasss <3359595624@qq.com> Date: Wed, 4 Sep 2024 16:45:33 +0800 Subject: [PATCH 09/15] ui: register page --- xinference/web/ui/package-lock.json | 38 ++ xinference/web/ui/package.json | 1 + .../register_model/components/addStop.js | 107 ++++ .../web/ui/src/scenes/register_model/index.js | 3 +- .../scenes/register_model/registerModel.js | 489 ++++++++++-------- .../styles/registerModelStyle.css | 23 + 6 files changed, 452 insertions(+), 209 deletions(-) create mode 100644 xinference/web/ui/src/scenes/register_model/components/addStop.js diff --git a/xinference/web/ui/package-lock.json b/xinference/web/ui/package-lock.json index 0730d3b275..7f15648e74 100644 --- a/xinference/web/ui/package-lock.json +++ b/xinference/web/ui/package-lock.json @@ -29,6 +29,7 @@ "@testing-library/user-event": "^13.5.0", "clipboard": "^2.0.11", "formik": "^2.4.2", + "nunjucks": 
"^3.2.4", "prop-types": "^15.8.1", "react": "^18.2.0", "react-cookie": "^6.1.1", @@ -5799,6 +5800,11 @@ "resolved": "https://registry.npmjs.org/@xtuc/long/-/long-4.2.2.tgz", "integrity": "sha512-NuHqBY1PB/D8xU6s/thBgOAiAP7HOYDQ32+BFZILJ8ivkUkAHQnWfn6WhL79Owj1qmUnoN/YPhktdIoucipkAQ==" }, + "node_modules/a-sync-waterfall": { + "version": "1.0.1", + "resolved": "https://registry.npmmirror.com/a-sync-waterfall/-/a-sync-waterfall-1.0.1.tgz", + "integrity": "sha512-RYTOHHdWipFUliRFMCS4X2Yn2X8M87V/OpSqWzKKOGhzqyUxzyVmhHDH9sAvG+ZuQf/TAOFsLCpMw09I1ufUnA==" + }, "node_modules/abab": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/abab/-/abab-2.0.6.tgz", @@ -13750,6 +13756,38 @@ "url": "https://github.com/fb55/nth-check?sponsor=1" } }, + "node_modules/nunjucks": { + "version": "3.2.4", + "resolved": "https://registry.npmmirror.com/nunjucks/-/nunjucks-3.2.4.tgz", + "integrity": "sha512-26XRV6BhkgK0VOxfbU5cQI+ICFUtMLixv1noZn1tGU38kQH5A5nmmbk/O45xdyBhD1esk47nKrY0mvQpZIhRjQ==", + "dependencies": { + "a-sync-waterfall": "^1.0.0", + "asap": "^2.0.3", + "commander": "^5.1.0" + }, + "bin": { + "nunjucks-precompile": "bin/precompile" + }, + "engines": { + "node": ">= 6.9.0" + }, + "peerDependencies": { + "chokidar": "^3.3.0" + }, + "peerDependenciesMeta": { + "chokidar": { + "optional": true + } + } + }, + "node_modules/nunjucks/node_modules/commander": { + "version": "5.1.0", + "resolved": "https://registry.npmmirror.com/commander/-/commander-5.1.0.tgz", + "integrity": "sha512-P0CysNDQ7rtVw4QIQtm+MRxV66vKFSvlsQvGYXZWR3qFU0jlMKHZZZgw8e+8DSah4UDKMqnknRDQz+xuQXQ/Zg==", + "engines": { + "node": ">= 6" + } + }, "node_modules/nwsapi": { "version": "2.2.7", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.7.tgz", diff --git a/xinference/web/ui/package.json b/xinference/web/ui/package.json index 0a163ec52b..1bda015ba8 100644 --- a/xinference/web/ui/package.json +++ b/xinference/web/ui/package.json @@ -25,6 +25,7 @@ "@testing-library/user-event": "^13.5.0", "clipboard": "^2.0.11", "formik": "^2.4.2", + "nunjucks": "^3.2.4", "prop-types": "^15.8.1", "react": "^18.2.0", "react-cookie": "^6.1.1", diff --git a/xinference/web/ui/src/scenes/register_model/components/addStop.js b/xinference/web/ui/src/scenes/register_model/components/addStop.js new file mode 100644 index 0000000000..0acca09981 --- /dev/null +++ b/xinference/web/ui/src/scenes/register_model/components/addStop.js @@ -0,0 +1,107 @@ +import AddIcon from '@mui/icons-material/Add' +import DeleteIcon from '@mui/icons-material/Delete' +import { Alert, Button, TextField } from '@mui/material' +import React, { useEffect, useState } from 'react' + +const regex = /^[1-9]\d*$/ + +const AddStop = ({ label, onGetData, arrItemType, formData, onGetError }) => { + const [dataArr, setDataArr] = useState(formData?.length ? 
formData : ['']) + const arr = [] + + useEffect(() => { + if (arrItemType === 'number') { + const newDataArr = dataArr.map((item) => { + if (item && regex.test(item)) { + arr.push('true') + return Number(item) + } + if (item && !regex.test(item)) arr.push('false') + return item + }) + onGetError(arr) + onGetData(newDataArr) + } else { + onGetData(dataArr) + } + }, [dataArr]) + + const handleChange = (value, index) => { + const arr = [...dataArr] + arr[index] = value + setDataArr([...arr]) + } + + const handleAdd = () => { + if (dataArr[dataArr.length - 1]) { + setDataArr([...dataArr, '']) + } + } + + const handleDelete = (index) => { + setDataArr(dataArr.filter((_, subIndex) => index !== subIndex)) + } + + const handleShowAlert = (item) => { + return item !== '' && !regex.test(item) && arrItemType === 'number' + } + + return ( + <> +
+
+ + +
+
+ {dataArr.map((item, index) => ( +
+
+ handleChange(e.target.value, index)} + size="small" + style={{ width: '100%' }} + /> + {dataArr.length > 1 && ( + handleDelete(index)} + style={{ cursor: 'pointer', color: '#1976d2' }} + /> + )} +
+ + {handleShowAlert(item) && ( + + Please enter an integer greater than 0. + + )} +
+ ))} +
+
+ + ) +} + +export default AddStop diff --git a/xinference/web/ui/src/scenes/register_model/index.js b/xinference/web/ui/src/scenes/register_model/index.js index eb5b0a9e77..6aa0146bc9 100644 --- a/xinference/web/ui/src/scenes/register_model/index.js +++ b/xinference/web/ui/src/scenes/register_model/index.js @@ -63,7 +63,6 @@ const RegisterModel = () => { context_length: 2048, model_lang: ['en'], model_ability: ['generate'], - model_family: '', model_specs: [ { model_uri: '/path/to/llama-1', @@ -72,7 +71,7 @@ const RegisterModel = () => { quantizations: ['none'], }, ], - prompt_style: undefined, + model_family: '', }} /> diff --git a/xinference/web/ui/src/scenes/register_model/registerModel.js b/xinference/web/ui/src/scenes/register_model/registerModel.js index 06cc582927..717587d6d6 100644 --- a/xinference/web/ui/src/scenes/register_model/registerModel.js +++ b/xinference/web/ui/src/scenes/register_model/registerModel.js @@ -1,14 +1,20 @@ import './styles/registerModelStyle.css' -import CheckIcon from '@mui/icons-material/Check' +import Cancel from '@mui/icons-material/Cancel' +import CheckCircleIcon from '@mui/icons-material/CheckCircle' import KeyboardDoubleArrowRightIcon from '@mui/icons-material/KeyboardDoubleArrowRight' import NotesIcon from '@mui/icons-material/Notes' +import OpenInFullIcon from '@mui/icons-material/OpenInFull' import { Alert, Box, Button, Checkbox, Chip, + Dialog, + DialogActions, + DialogContent, + DialogTitle, FormControl, FormControlLabel, InputLabel, @@ -21,6 +27,7 @@ import { TextField, Tooltip, } from '@mui/material' +import nunjucks from 'nunjucks' import React, { useContext, useEffect, useRef, useState } from 'react' import { useCookies } from 'react-cookie' import { useNavigate, useParams } from 'react-router-dom' @@ -31,22 +38,27 @@ import fetchWrapper from '../../components/fetchWrapper' import { isValidBearerToken } from '../../components/utils' import AddControlnet from './components/addControlnet' import AddModelSpecs from './components/addModelSpecs' +import AddStop from './components/addStop' import languages from './data/languages' const SUPPORTED_LANGUAGES_DICT = { en: 'English', zh: 'Chinese' } const SUPPORTED_FEATURES = ['Generate', 'Chat', 'Vision'] +const messages = [ + { + role: 'assistant', + content: 'This is the message content replied by the assistant previously', + }, + { + role: 'user', + content: 'This is the message content sent by the user currently', + }, +] // Convert dictionary of supported languages into list const SUPPORTED_LANGUAGES = Object.keys(SUPPORTED_LANGUAGES_DICT) const RegisterModelComponent = ({ modelType, customData }) => { - const endPoint = useContext(ApiContext).endPoint const { setErrorMsg } = useContext(ApiContext) const [formData, setFormData] = useState(customData) - const [promptStyles, setPromptStyles] = useState([]) - const [family, setFamily] = useState({ - chat: [], - generate: [], - }) const [languagesArr, setLanguagesArr] = useState([]) const [isContextLengthAlert, setIsContextLengthAlert] = useState(false) const [isDimensionsAlert, setIsDimensionsAlert] = useState(false) @@ -73,6 +85,11 @@ const RegisterModelComponent = ({ modelType, customData }) => { ) const [contrastObj, setContrastObj] = useState({}) const [isEqual, setIsEqual] = useState(true) + const [testRes, setTestRes] = useState('') + const [isOpenMessages, setIsOpenMessages] = useState(false) + const [testErrorInfo, setTestErrorInfo] = useState('') + const [isTestSuccess, setIsTestSuccess] = useState(false) + const [isStopTokenIdsAlert, 
setIsStopTokenIdsAlert] = useState(false) useEffect(() => { if (model_name) { @@ -93,7 +110,9 @@ const RegisterModelComponent = ({ modelType, customData }) => { model_ability, model_family, model_specs, - prompt_style, + chat_template, + stop_token_ids, + stop, } = data const specsDataArr = model_specs.map((item) => { const { @@ -120,8 +139,10 @@ const RegisterModelComponent = ({ modelType, customData }) => { model_ability, model_family, model_specs: specsDataArr, + chat_template, + stop_token_ids, + stop, } - prompt_style ? (llmData.prompt_style = prompt_style) : '' setFormData(llmData) setContrastObj(llmData) setSpecsArr(specsDataArr) @@ -217,79 +238,6 @@ const RegisterModelComponent = ({ modelType, customData }) => { navigate('/login', { replace: true }) return } - - const getBuiltinFamilies = async () => { - const response = await fetch(endPoint + '/v1/models/families', { - method: 'GET', - headers: { - 'Content-Type': 'application/json', - }, - }) - if (!response.ok) { - const errorData = await response.json() // Assuming the server returns error details in JSON format - setErrorMsg( - `Server error: ${response.status} - ${ - errorData.detail || 'Unknown error' - }` - ) - } else { - const data = await response.json() - data.chat.push('other') - data.generate.push('other') - setFamily(data) - } - } - - const getBuiltInPromptStyles = async () => { - const response = await fetch(endPoint + '/v1/models/prompts', { - method: 'GET', - headers: { - 'Content-Type': 'application/json', - }, - }) - if (!response.ok) { - const errorData = await response.json() // Assuming the server returns error details in JSON format - setErrorMsg( - `Server error: ${response.status} - ${ - errorData.detail || 'Unknown error' - }` - ) - } else { - const data = await response.json() - let res = [] - for (const key in data) { - let v = data[key] - v['name'] = key - res.push(v) - } - setPromptStyles(res) - } - } - - if ( - Object.prototype.hasOwnProperty.call(customData, 'model_ability') && - Object.prototype.hasOwnProperty.call(customData, 'model_family') - ) { - // avoid keep requesting backend to get prompts - if (promptStyles.length === 0) { - getBuiltInPromptStyles().catch((error) => { - setErrorMsg( - error.message || - 'An unexpected error occurred when getting builtin prompt styles.' - ) - console.error('Error: ', error) - }) - } - if (family.chat.length === 0) { - getBuiltinFamilies().catch((error) => { - setErrorMsg( - error.message || - 'An unexpected error occurred when getting builtin prompt styles.' 
- ) - console.error('Error: ', error) - }) - } - } }, [cookie.token]) useEffect(() => { @@ -299,34 +247,7 @@ const RegisterModelComponent = ({ modelType, customData }) => { } }, [formData]) - const getFamilyByAbility = () => { - if ( - formData.model_ability.includes('chat') || - formData.model_ability.includes('vision') - ) { - return family.chat - } else { - return family.generate - } - } - - const sortStringsByFirstLetter = (arr) => { - return arr.sort((a, b) => { - const firstCharA = a.charAt(0).toLowerCase() - const firstCharB = b.charAt(0).toLowerCase() - if (firstCharA < firstCharB) { - return -1 - } - if (firstCharA > firstCharB) { - return 1 - } - return 0 - }) - } - const handleClick = async () => { - console.log('formData', modelType, formData) - for (let key in formData) { const type = Object.prototype.toString.call(formData[key]).slice(8, -1) if ( @@ -427,61 +348,26 @@ const RegisterModelComponent = ({ modelType, customData }) => { } const toggleAbility = (ability) => { + const obj = JSON.parse(JSON.stringify(formData)) if (formData.model_ability.includes(ability)) { - const obj = JSON.parse(JSON.stringify(formData)) if (ability === 'chat') { - delete obj.prompt_style + delete obj.chat_template + delete obj.stop_token_ids + delete obj.stop } setFormData({ ...obj, model_ability: formData.model_ability.filter((a) => a !== ability), - model_family: '', }) } else { - setFormData({ - ...formData, - model_ability: [...formData.model_ability, ability], - model_family: '', - }) - } - } - - const toggleFamily = (value) => { - const ps = promptStyles.find((item) => item.name === value) - if (formData.model_ability.includes('chat') && ps) { - const prompt_style = { - style_name: ps.style_name, - system_prompt: ps.system_prompt, - roles: ps.roles, - intra_message_sep: ps.intra_message_sep, - inter_message_sep: ps.inter_message_sep, - stop: ps.stop ?? null, - stop_token_ids: ps.stop_token_ids ?? null, + if (ability === 'chat') { + obj.chat_template = '' + obj.stop_token_ids = [] + obj.stop = [] } setFormData({ - ...formData, - model_family: value, - prompt_style, - }) - } else { - const { - version, - model_name, - model_description, - context_length, - model_lang, - model_ability, - model_specs, - } = formData - setFormData({ - version, - model_name, - model_description, - context_length, - model_lang, - model_ability, - model_family: value, - model_specs, + ...obj, + model_ability: [...formData.model_ability, ability], }) } } @@ -569,6 +455,58 @@ const RegisterModelComponent = ({ modelType, customData }) => { return true } + const handleTest = () => { + setTestRes('') + if (formData.chat_template) { + try { + nunjucks.configure({ autoescape: false }) + const test_res = nunjucks.renderString(formData.chat_template, { + messages: messages, + }) + if (test_res === '') { + setTestRes(test_res) + setTestErrorInfo('error') + setIsTestSuccess(false) + } else { + setTestRes(test_res) + setTestErrorInfo('') + setIsTestSuccess(true) + } + } catch (error) { + setTestErrorInfo(`${error}`) + setIsTestSuccess(false) + } + } + } + + const getStopTokenIds = (value) => { + if (value.length === 1 && value[0] === '') { + setFormData({ + ...formData, + stop_token_ids: [], + }) + } else { + setFormData({ + ...formData, + stop_token_ids: value, + }) + } + } + + const getStop = (value) => { + if (value.length === 1 && value[0] === '') { + setFormData({ + ...formData, + stop: [], + }) + } else { + setFormData({ + ...formData, + stop: value, + }) + } + } + return (
@@ -845,66 +783,162 @@ const RegisterModelComponent = ({ modelType, customData }) => { {/* family */} {(customData.model_family === '' || customData.model_family) && ( - - - {modelType === 'LLM' && formData.model_family && ( - } - severity="success" - > - Please be careful to select the family name corresponding to - the model you want to register. If not found, please choose - other - . - - )} - {modelType === 'LLM' && !formData.model_family && ( - - Please be careful to select the family name corresponding to - the model you want to register. If not found, please choose - other - . - + <> + {modelType === 'LLM' && ( + <> + + setFormData({ + ...formData, + model_family: event.target.value, + }) + } + /> + + )} - { - toggleFamily(e.target.value) - }} - > - - {modelType === 'LLM' && - sortStringsByFirstLetter(getFamilyByAbility()).map((v) => ( - + {(modelType === 'image' || modelType === 'audio') && ( + <> + + + + } - label={v} + label={formData.model_family} /> - ))} - {(modelType === 'image' || modelType === 'audio') && ( - } - label={formData.model_family} + + + + + )} + + )} + + {/* chat_template */} + {formData.model_ability?.includes('chat') && ( + <> +
+ + setFormData({ + ...formData, + chat_template: event.target.value, + }) + } + style={{ flex: 1 }} + /> + +
+
+ messages example + setIsOpenMessages(true)} + style={{ fontSize: 14, color: '#666', cursor: 'pointer' }} /> - )} - - +
+
+ + test result + {testErrorInfo ? ( + + ) : testRes ? ( + + ) : ( + '' + )} + +
+ {testErrorInfo !== '' + ? testErrorInfo + : testRes + ? testRes + : 'No test results...'} +
+
+
+
+ + + )} + + {/* stop_token_ids */} + {formData.model_ability?.includes('chat') && ( + <> + { + if (value.includes('false')) { + setIsStopTokenIdsAlert(true) + } else { + setIsStopTokenIdsAlert(false) + } + }} + /> -
+ + )} + + {/* stop */} + {formData.model_ability?.includes('chat') && ( + <> + + + )} {/* specs */} @@ -1011,6 +1045,21 @@ const RegisterModelComponent = ({ modelType, customData }) => { color="primary" type="submit" onClick={handleClick} + disabled={ + isContextLengthAlert || + isDimensionsAlert || + isMaxTokensAlert || + formData.model_lang?.length === 0 || + formData.language?.length === 0 || + formData.model_ability?.length === 0 || + (modelType === 'LLM' && !formData.model_family) || + (formData.model_ability?.includes('chat') && + !formData.chat_template) || + (formData.model_ability?.includes('chat') && + formData.chat_template && + !isTestSuccess) || + isStopTokenIdsAlert + } > Register Model @@ -1018,6 +1067,32 @@ const RegisterModelComponent = ({ modelType, customData }) => { )}
+ setIsOpenMessages(false)} + aria-labelledby="alert-dialog-title" + aria-describedby="alert-dialog-description" + > + Messages Example + +
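The reworked registration form drops the old prompt_style and built-in family pickers in favor of a free-form chat_template plus stop and stop_token_ids fields, and it smoke-tests the template in the browser by rendering it with nunjucks against the fixed two-message example defined at the top of registerModel.js; the Register button stays disabled until that render succeeds. A rough server-side equivalent of the same check is sketched below with Jinja2 (which nunjucks mirrors); the helper name and the use of jinja2 here are illustrative assumptions, not code from this patch.

# Illustrative sketch, not project code: render a user-supplied chat template
# against the same example messages the UI uses and reject an empty result.
from jinja2 import BaseLoader, Environment

EXAMPLE_MESSAGES = [
    {
        "role": "assistant",
        "content": "This is the message content replied by the assistant previously",
    },
    {
        "role": "user",
        "content": "This is the message content sent by the user currently",
    },
]


def smoke_test_chat_template(chat_template: str) -> str:
    env = Environment(loader=BaseLoader(), autoescape=False)
    rendered = env.from_string(chat_template).render(messages=EXAMPLE_MESSAGES)
    if not rendered.strip():
        raise ValueError("chat_template rendered an empty prompt")
    return rendered


if __name__ == "__main__":
    # A hypothetical template, only to exercise the check above.
    template = (
        "{% for message in messages %}"
        "<|{{ message['role'] }}|>\n{{ message['content'] }}\n"
        "{% endfor %}<|assistant|>\n"
    )
    print(smoke_test_chat_template(template))

Raising on an empty render mirrors the UI behavior, where an empty nunjucks result is treated as a failed test and keeps isTestSuccess false.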