From 48b309ede234c70ef9fc592ab8b676cd375e9b00 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 17:20:02 +0800 Subject: [PATCH 01/15] dev --- xinference/api/restful_api.py | 40 +- xinference/client/restful/restful_client.py | 27 +- xinference/client/tests/test_client.py | 5 +- xinference/core/chat_interface.py | 22 +- xinference/core/model.py | 43 +- xinference/core/scheduler.py | 56 +- xinference/core/supervisor.py | 5 +- .../core/tests/test_continuous_batching.py | 7 +- xinference/core/tests/test_metrics.py | 3 +- xinference/core/tests/test_restful_api.py | 3 +- xinference/core/tests/test_types.py | 3 - xinference/deploy/cmdline.py | 24 +- xinference/model/llm/__init__.py | 30 +- xinference/model/llm/llama_cpp/core.py | 43 +- xinference/model/llm/llm_family.json | 1878 +++++------------ xinference/model/llm/llm_family.py | 83 +- xinference/model/llm/llm_family_csghub.json | 53 +- .../model/llm/llm_family_modelscope.json | 1650 +++++---------- xinference/model/llm/lmdeploy/core.py | 144 +- xinference/model/llm/mlx/core.py | 97 +- xinference/model/llm/mlx/tests/test_mlx.py | 3 +- xinference/model/llm/sglang/core.py | 46 +- xinference/model/llm/tests/test_llm_family.py | 173 +- xinference/model/llm/tests/test_multimodal.py | 63 +- xinference/model/llm/tests/test_utils.py | 303 --- xinference/model/llm/transformers/chatglm.py | 472 ++--- xinference/model/llm/transformers/cogvlm2.py | 99 +- .../model/llm/transformers/cogvlm2_video.py | 256 +-- xinference/model/llm/transformers/core.py | 78 +- .../model/llm/transformers/deepseek_vl.py | 149 +- xinference/model/llm/transformers/glm4v.py | 166 +- .../model/llm/transformers/intern_vl.py | 109 +- .../model/llm/transformers/internlm2.py | 86 +- xinference/model/llm/transformers/llama_2.py | 108 - .../model/llm/transformers/minicpmv25.py | 77 +- .../model/llm/transformers/minicpmv26.py | 78 +- xinference/model/llm/transformers/omnilmm.py | 33 +- xinference/model/llm/transformers/qwen_vl.py | 118 +- .../llm/transformers/tests/test_tensorizer.py | 4 +- xinference/model/llm/transformers/utils.py | 72 +- xinference/model/llm/transformers/yi_vl.py | 104 +- xinference/model/llm/utils.py | 677 ++---- xinference/model/llm/vllm/core.py | 227 +- xinference/types.py | 4 +- 44 files changed, 2441 insertions(+), 5280 deletions(-) delete mode 100644 xinference/model/llm/transformers/llama_2.py diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 5a80941f9f..20a84a21ad 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -57,9 +57,7 @@ from ..core.supervisor import SupervisorActor from ..core.utils import json_dumps from ..types import ( - SPECIAL_TOOL_PROMPT, ChatCompletion, - ChatCompletionMessage, Completion, CreateChatCompletion, CreateCompletion, @@ -1627,33 +1625,7 @@ async def create_chat_completion(self, request: Request) -> Response: status_code=400, detail="Invalid input. Please specify the prompt." ) - system_messages: List["ChatCompletionMessage"] = [] - system_messages_contents = [] - non_system_messages = [] - for msg in messages: - assert ( - msg.get("content") != SPECIAL_TOOL_PROMPT - ), f"Invalid message content {SPECIAL_TOOL_PROMPT}" - if msg["role"] == "system": - system_messages_contents.append(msg["content"]) - else: - non_system_messages.append(msg) - system_messages.append( - {"role": "system", "content": ". 
".join(system_messages_contents)} - ) - has_tool_message = messages[-1].get("role") == "tool" - if has_tool_message: - prompt = SPECIAL_TOOL_PROMPT - system_prompt = system_messages[0]["content"] if system_messages else None - chat_history = non_system_messages # exclude the prompt - else: - prompt = None - if non_system_messages: - prompt = non_system_messages[-1]["content"] - system_prompt = system_messages[0]["content"] if system_messages else None - chat_history = non_system_messages[:-1] # exclude the prompt - model_uid = body.model try: @@ -1681,9 +1653,7 @@ async def create_chat_completion(self, request: Request) -> Response: from ..model.llm.utils import GLM4_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY model_family = desc.get("model_family", "") - function_call_models = ( - ["gorilla-openfunctions-v1"] + QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY - ) + function_call_models = QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY if model_family not in function_call_models: if body.tools: @@ -1716,9 +1686,7 @@ async def stream_results(): try: try: iterator = await model.chat( - prompt, - system_prompt, - chat_history, + messages, kwargs, raw_params=raw_kwargs, ) @@ -1750,9 +1718,7 @@ async def stream_results(): else: try: data = await model.chat( - prompt, - system_prompt, - chat_history, + messages, kwargs, raw_params=raw_kwargs, ) diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index 679f65d296..5958ca4134 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -13,7 +13,6 @@ # limitations under the License. import json import typing -import warnings from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union import requests @@ -470,9 +469,7 @@ def generate( class RESTfulChatModelHandle(RESTfulGenerateModelHandle): def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List["ChatCompletionMessage"]] = None, + messages: List[Dict], tools: Optional[List[Dict]] = None, generate_config: Optional[ Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"] @@ -483,11 +480,7 @@ def chat( Parameters ---------- - prompt: str - The user's input. - system_prompt: Optional[str] - The system context provide to Model prior to any chats. - chat_history: Optional[List["ChatCompletionMessage"]] + messages: List[Dict] A list of messages comprising the conversation so far. tools: Optional[List[Dict]] A tool list. @@ -509,25 +502,11 @@ def chat( Report the failure to generate the chat from the server. Detailed information provided in error message. 
""" - warnings.warn( - "The parameters `prompt`, `system_prompt` and `chat_history` will be deprecated in version v0.15.0, " - "and will be replaced by the parameter `messages`, " - "similar to the OpenAI API: https://platform.openai.com/docs/guides/chat-completions/getting-started", - category=DeprecationWarning, - stacklevel=2, - ) - url = f"{self._base_url}/v1/chat/completions" - if chat_history is None: - chat_history = [] - - chat_history = handle_system_prompts(chat_history, system_prompt) - chat_history.append({"role": "user", "content": prompt}) # type: ignore - request_body: Dict[str, Any] = { "model": self._model_uid, - "messages": chat_history, + "messages": messages, } if tools is not None: request_body["tools"] = tools diff --git a/xinference/client/tests/test_client.py b/xinference/client/tests/test_client.py index 095ef5e182..e6bd554129 100644 --- a/xinference/client/tests/test_client.py +++ b/xinference/client/tests/test_client.py @@ -73,12 +73,13 @@ def test_RESTful_client(setup): with pytest.raises(RuntimeError): completion = model.chat({"max_tokens": 64}) - completion = model.chat("What is the capital of France?") + messages = {"role": "user", "content": "What is the capital of France?"} + completion = model.chat(messages) assert "content" in completion["choices"][0]["message"] def _check_stream(): streaming_response = model.chat( - prompt="What is the capital of France?", + messages, generate_config={"stream": True, "max_tokens": 5}, ) for chunk in streaming_response: diff --git a/xinference/core/chat_interface.py b/xinference/core/chat_interface.py index 8738141f90..9de2dab252 100644 --- a/xinference/core/chat_interface.py +++ b/xinference/core/chat_interface.py @@ -16,7 +16,7 @@ import logging import os from io import BytesIO -from typing import Generator, List, Optional +from typing import Dict, Generator, List, Optional import gradio as gr import PIL.Image @@ -27,7 +27,6 @@ RESTfulChatModelHandle, RESTfulGenerateModelHandle, ) -from ..types import ChatCompletionMessage logger = logging.getLogger(__name__) @@ -96,11 +95,11 @@ def flatten(matrix: List[List[str]]) -> List[str]: flat_list += row return flat_list - def to_chat(lst: List[str]) -> List[ChatCompletionMessage]: + def to_chat(lst: List[str]) -> List[Dict]: res = [] for i in range(len(lst)): role = "assistant" if i % 2 == 1 else "user" - res.append(ChatCompletionMessage(role=role, content=lst[i])) + res.append(dict(role=role, content=lst[i])) return res def generate_wrapper( @@ -116,11 +115,12 @@ def generate_wrapper( client._set_token(self._access_token) model = client.get_model(self.model_uid) assert isinstance(model, RESTfulChatModelHandle) + messages = to_chat(flatten(history)) + messages.append(dict(role="user", content=message)) response_content = "" for chunk in model.chat( - prompt=message, - chat_history=to_chat(flatten(history)), + messages, generate_config={ "max_tokens": int(max_tokens), "temperature": temperature, @@ -191,15 +191,10 @@ def predict(history, bot, max_tokens, temperature, stream): model = client.get_model(self.model_uid) assert isinstance(model, RESTfulChatModelHandle) - prompt = history[-1] - assert prompt["role"] == "user" - prompt = prompt["content"] - # multimodal chat does not support stream. 
if stream: response_content = "" for chunk in model.chat( - prompt=prompt, - chat_history=history[:-1], + messages=history, generate_config={ "max_tokens": max_tokens, "temperature": temperature, @@ -224,8 +219,7 @@ def predict(history, bot, max_tokens, temperature, stream): yield history, bot else: response = model.chat( - prompt=prompt, - chat_history=history[:-1], + messages=history, generate_config={ "max_tokens": max_tokens, "temperature": temperature, diff --git a/xinference/core/model.py b/xinference/core/model.py index 4b08a4e9a8..10ab759fe6 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -439,9 +439,7 @@ async def _call_wrapper(self, output_type: str, fn: Callable, *args, **kwargs): @log_async(logger=logger) async def generate(self, prompt: str, *args, **kwargs): if self.allow_batching(): - return await self.handle_batching_request( - prompt, "generate", *args, **kwargs - ) + return await self.handle_batching_request(prompt, *args, **kwargs) else: kwargs.pop("raw_params", None) if hasattr(self._model, "generate"): @@ -481,22 +479,27 @@ async def _queue_consumer( yield res @staticmethod - def _get_stream_from_args(ability: str, *args) -> bool: - if ability == "chat": - assert args[2] is None or isinstance(args[2], dict) - return False if args[2] is None else args[2].get("stream", False) - else: - assert args[0] is None or isinstance(args[0], dict) - return False if args[0] is None else args[0].get("stream", False) + def _get_stream_from_args(*args) -> bool: + assert args[0] is None or isinstance(args[0], dict) + return False if args[0] is None else args[0].get("stream", False) - async def handle_batching_request(self, prompt: str, ability: str, *args, **kwargs): - stream = self._get_stream_from_args(ability, *args) + async def handle_batching_request( + self, prompt_or_messages: Union[str, List[Dict]], *args, **kwargs + ): + """ + The input parameter `prompt_or_messages`: + - when the model_ability is `generate`, it's `prompt`, which is str type. + - when the model_ability is `chat`, it's `messages`, which is List[Dict] type. 
+ """ + stream = self._get_stream_from_args(*args) assert self._scheduler_ref is not None if stream: assert self._scheduler_ref is not None queue: Queue[Any] = Queue() ret = self._queue_consumer(queue) - await self._scheduler_ref.add_request(prompt, queue, *args, **kwargs) + await self._scheduler_ref.add_request( + prompt_or_messages, queue, *args, **kwargs + ) gen = self._to_async_gen("json", ret) self._current_generator = weakref.ref(gen) return gen @@ -505,7 +508,9 @@ async def handle_batching_request(self, prompt: str, ability: str, *args, **kwar assert self._loop is not None future = ConcurrentFuture() - await self._scheduler_ref.add_request(prompt, future, *args, **kwargs) + await self._scheduler_ref.add_request( + prompt_or_messages, future, *args, **kwargs + ) fut = asyncio.wrap_future(future, loop=self._loop) result = await fut if result == XINFERENCE_NON_STREAMING_ABORT_FLAG: @@ -517,24 +522,22 @@ async def handle_batching_request(self, prompt: str, ability: str, *args, **kwar @request_limit @xo.generator @log_async(logger=logger) - async def chat(self, prompt: str, *args, **kwargs): + async def chat(self, messages: List[Dict], *args, **kwargs): start_time = time.time() response = None try: if self.allow_batching(): - return await self.handle_batching_request( - prompt, "chat", *args, **kwargs - ) + return await self.handle_batching_request(messages, *args, **kwargs) else: kwargs.pop("raw_params", None) if hasattr(self._model, "chat"): response = await self._call_wrapper_json( - self._model.chat, prompt, *args, **kwargs + self._model.chat, messages, *args, **kwargs ) return response if hasattr(self._model, "async_chat"): response = await self._call_wrapper_json( - self._model.async_chat, prompt, *args, **kwargs + self._model.async_chat, messages, *args, **kwargs ) return response raise AttributeError(f"Model {self._model.model_spec} is not for chat.") diff --git a/xinference/core/scheduler.py b/xinference/core/scheduler.py index 6b28f70259..842b8bd737 100644 --- a/xinference/core/scheduler.py +++ b/xinference/core/scheduler.py @@ -18,7 +18,7 @@ import uuid from collections import deque from enum import Enum -from typing import List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple, Union import xoscar as xo @@ -37,9 +37,11 @@ class AbortRequestMessage(Enum): class InferenceRequest: - def __init__(self, prompt, future_or_queue, is_prefill, *args, **kwargs): - # original prompt - self._prompt = prompt + def __init__( + self, prompt_or_messages, future_or_queue, is_prefill, *args, **kwargs + ): + # original prompt, prompt(str) for generate model and messages(List[Dict]) for chat model + self._prompt = prompt_or_messages # full prompt that contains chat history and applies chat template self._full_prompt = None # whether the current request is in the prefill phase @@ -88,29 +90,17 @@ def __init__(self, prompt, future_or_queue, is_prefill, *args, **kwargs): self._check_args() def _check_args(self): - # chat - if len(self._inference_args) == 3: - # system prompt - assert self._inference_args[0] is None or isinstance( - self._inference_args[0], str - ) - # chat history - assert self._inference_args[1] is None or isinstance( - self._inference_args[1], list - ) - # generate config - assert self._inference_args[2] is None or isinstance( - self._inference_args[2], dict - ) - else: # generate - assert len(self._inference_args) == 1 - # generate config - assert self._inference_args[0] is None or isinstance( - self._inference_args[0], dict - ) + assert 
len(self._inference_args) == 1 + # generate config + assert self._inference_args[0] is None or isinstance( + self._inference_args[0], dict + ) @property def prompt(self): + """ + prompt for generate model and messages for chat model + """ return self._prompt @property @@ -162,11 +152,7 @@ def append_new_token(self, token: int): @property def generate_config(self): - return ( - self._inference_args[2] - if len(self._inference_args) == 3 - else self._inference_args[0] - ) + return self._inference_args[0] @property def sanitized_generate_config(self): @@ -423,8 +409,16 @@ async def step(self): self._empty_cache() - async def add_request(self, prompt: str, future_or_queue, *args, **kwargs): - req = InferenceRequest(prompt, future_or_queue, True, *args, **kwargs) + async def add_request( + self, + prompt_or_messages: Union[str, List[Dict]], + future_or_queue, + *args, + **kwargs, + ): + req = InferenceRequest( + prompt_or_messages, future_or_queue, True, *args, **kwargs + ) rid = req.request_id if rid is not None: if rid in self._id_to_req: diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index a18d926c7b..61fc4caa8e 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -310,10 +310,7 @@ async def get_cluster_device_info(self, detailed: bool = False) -> List: async def get_builtin_prompts() -> Dict[str, Any]: from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE - data = {} - for k, v in BUILTIN_LLM_PROMPT_STYLE.items(): - data[k] = v.dict() - return data + return {k: v for k, v in BUILTIN_LLM_PROMPT_STYLE.items()} @staticmethod async def get_builtin_families() -> Dict[str, List[str]]: diff --git a/xinference/core/tests/test_continuous_batching.py b/xinference/core/tests/test_continuous_batching.py index f6db0362cf..c58b91bb55 100644 --- a/xinference/core/tests/test_continuous_batching.py +++ b/xinference/core/tests/test_continuous_batching.py @@ -48,7 +48,7 @@ def join(self, timeout=None): class InferenceThread(BaseThread): def __init__(self, prompt, generate_config, client, model): super().__init__() - self._prompt = prompt + self._prompt = [{"role": "user", "content": prompt}] self._generate_config = generate_config self._client = client self._model = model @@ -159,11 +159,12 @@ def test_continuous_batching(enable_batch, setup): thread2.join() # test error generate config + messages = [{"role": "user", "content": "你好"}] with pytest.raises(RuntimeError): - model.chat("你好", generate_config={"max_tokens": 99999999999999999}) + model.chat(messages, generate_config={"max_tokens": 99999999999999999}) with pytest.raises(RuntimeError): - model.chat("你好", generate_config={"stream_interval": 0}) + model.chat(messages, generate_config={"stream_interval": 0}) # test error with other correct requests thread1 = InferenceThread("1+1=3正确吗?", {"stream": True}, client, model) diff --git a/xinference/core/tests/test_metrics.py b/xinference/core/tests/test_metrics.py index 0004c5932f..4bcd2c3bbd 100644 --- a/xinference/core/tests/test_metrics.py +++ b/xinference/core/tests/test_metrics.py @@ -140,7 +140,8 @@ async def test_metrics_exporter_data(setup_cluster): ) model = client.get_model(model_uid) - response = model.chat("write a poem.") + messages = [{"role": "user", "content": "write a poem."}] + response = model.chat(messages) response = requests.get(metrics_exporter_address) assert response.ok diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py index cd47b98cc5..0c50eb256d 100644 --- 
a/xinference/core/tests/test_restful_api.py +++ b/xinference/core/tests/test_restful_api.py @@ -526,7 +526,8 @@ def test_restful_api_for_tool_calls(setup, model_format, quantization): client = RESTfulClient(endpoint) model = client.get_model(model_uid_res) - completion = model.chat("帮我查询股票10111的价格", tools=tools) + messages = [{"role": "user", "content": "帮我查询股票10111的价格"}] + completion = model.chat(messages, tools=tools) assert "content" in completion["choices"][0]["message"] assert "tool_calls" == completion["choices"][0]["finish_reason"] assert ( diff --git a/xinference/core/tests/test_types.py b/xinference/core/tests/test_types.py index 8dd3fdbd63..bfcd9d89dd 100644 --- a/xinference/core/tests/test_types.py +++ b/xinference/core/tests/test_types.py @@ -82,9 +82,6 @@ def test_create_chat_completion_types(): with pytest.raises(ValidationError): CreateChatCompletion(model="abc", not_exist="jdk") - # with pytest.raises(pydantic.ValidationError): - # CreateChatCompletion(model="abc", messages=[{"role": "invalid"}]) - CreateChatCompletion(model="abc", messages=[{"role": "tool"}], max_tokens=None) types = [CreateChatCompletionTorch, CreateChatCompletionLlamaCpp] diff --git a/xinference/deploy/cmdline.py b/xinference/deploy/cmdline.py index 8eea848077..f0f09720a5 100644 --- a/xinference/deploy/cmdline.py +++ b/xinference/deploy/cmdline.py @@ -17,7 +17,7 @@ import os import sys import warnings -from typing import List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union import click from xoscar.utils import get_next_port @@ -38,7 +38,6 @@ XINFERENCE_LOG_MAX_BYTES, ) from ..isolation import Isolation -from ..types import ChatCompletionMessage from .utils import ( get_config_dict, get_log_file, @@ -1210,13 +1209,12 @@ def model_chat( stream: bool, api_key: Optional[str], ): - # TODO: chat model roles may not be user and assistant. endpoint = get_endpoint(endpoint) client = RESTfulClient(base_url=endpoint, api_key=api_key) if api_key is None: client._set_token(get_stored_token(endpoint, client)) - chat_history: "List[ChatCompletionMessage]" = [] + messages: List[Dict] = [] if stream: # TODO: when stream=True, RestfulClient cannot generate words one by one. # So use Client in temporary. 
The implementation needs to be changed to @@ -1229,10 +1227,10 @@ async def chat_internal(): if prompt == "": break print("Assistant: ", end="", file=sys.stdout) + messages.append(dict(role="user", content=prompt)) response_content = "" for chunk in model.chat( - prompt=prompt, - chat_history=chat_history, + messages, generate_config={"stream": stream, "max_tokens": max_tokens}, ): delta = chunk["choices"][0]["delta"] @@ -1242,10 +1240,7 @@ async def chat_internal(): response_content += delta["content"] print(delta["content"], end="", flush=True, file=sys.stdout) print("", file=sys.stdout) - chat_history.append(ChatCompletionMessage(role="user", content=prompt)) - chat_history.append( - ChatCompletionMessage(role="assistant", content=response_content) - ) + messages.append(dict(role="assistant", content=response_content)) model = client.get_model(model_uid=model_uid) @@ -1274,20 +1269,17 @@ async def chat_internal(): prompt = input("User: ") if prompt == "": break - chat_history.append(ChatCompletionMessage(role="user", content=prompt)) + messages.append({"role": "user", "content": prompt}) print("Assistant: ", end="", file=sys.stdout) response = restful_model.chat( - prompt=prompt, - chat_history=chat_history, + messages, generate_config={"stream": stream, "max_tokens": max_tokens}, ) if not isinstance(response, dict): raise ValueError("chat result is not valid") response_content = response["choices"][0]["message"]["content"] print(f"{response_content}\n", file=sys.stdout) - chat_history.append( - ChatCompletionMessage(role="assistant", content=response_content) - ) + messages.append(dict(role="assistant", content=response_content)) @cli.command("vllm-models", help="Query and display models compatible with vLLM.") diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index fc63c0b27a..ae9f9c4d55 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -45,7 +45,6 @@ LLMFamilyV1, LLMSpecV1, MLXLLMSpecV1, - PromptStyleV1, PytorchLLMSpecV1, get_cache_status, get_user_defined_llm_families, @@ -141,7 +140,6 @@ def _install(): from .transformers.glm4v import Glm4VModel from .transformers.intern_vl import InternVLChatModel from .transformers.internlm2 import Internlm2PytorchChatModel - from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel from .transformers.minicpmv25 import MiniCPMV25Model from .transformers.minicpmv26 import MiniCPMV26Model from .transformers.qwen_vl import QwenVLChatModel @@ -170,8 +168,6 @@ def _install(): TRANSFORMERS_CLASSES.extend( [ ChatglmPytorchChatModel, - LlamaPytorchModel, - LlamaPytorchChatModel, PytorchChatModel, Internlm2PytorchChatModel, QwenVLChatModel, @@ -204,13 +200,17 @@ def _install(): model_spec = LLMFamilyV1.parse_obj(json_obj) BUILTIN_LLM_FAMILIES.append(model_spec) - # register prompt style + # register chat_template if "chat" in model_spec.model_ability and isinstance( - model_spec.prompt_style, PromptStyleV1 + model_spec.chat_template, str ): # note that the key is the model name, # since there are multiple representations of the same prompt style name in json. 
- BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style + BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = { + "chat_template": model_spec.chat_template, + "stop_token_ids": model_spec.stop_token_ids, + "stop": model_spec.stop, + } # register model family if "chat" in model_spec.model_ability: BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name) @@ -230,10 +230,14 @@ def _install(): # if duplicated with huggingface json, keep it as the huggingface style if ( "chat" in model_spec.model_ability - and isinstance(model_spec.prompt_style, PromptStyleV1) + and isinstance(model_spec.chat_template, str) and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE ): - BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style + BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = { + "chat_template": model_spec.chat_template, + "stop_token_ids": model_spec.stop_token_ids, + "stop": model_spec.stop, + } # register model family if "chat" in model_spec.model_ability: BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name) @@ -253,10 +257,14 @@ def _install(): # if duplicated with huggingface json, keep it as the huggingface style if ( "chat" in model_spec.model_ability - and isinstance(model_spec.prompt_style, PromptStyleV1) + and isinstance(model_spec.chat_template, str) and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE ): - BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style + BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = { + "chat_template": model_spec.chat_template, + "stop_token_ids": model_spec.stop_token_ids, + "stop": model_spec.stop, + } # register model family if "chat" in model_spec.model_ability: BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index b820fce466..30a835ff7c 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -14,12 +14,11 @@ import logging import os import time -from typing import Iterable, Iterator, List, Optional, Union +from typing import Dict, Iterator, List, Optional, Union from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChunk, CompletionUsage, @@ -181,10 +180,12 @@ def generator_wrapper( for index, _completion_chunk in enumerate( self._llm(prompt=_prompt, **_generate_config) ): + _completion_chunk["model"] = self.model_uid request_id = _completion_chunk["id"] choice = _completion_chunk["choices"][0] if choice["finish_reason"] is not None: completion_tokens = index + choice.pop("text", None) total_tokens = prompt_tokens + completion_tokens _completion_chunk["usage"] = CompletionUsage( prompt_tokens=total_tokens, @@ -262,39 +263,25 @@ def _sanitize_generate_config( self, generate_config: Optional[LlamaCppGenerateConfig] ) -> LlamaCppGenerateConfig: generate_config = super()._sanitize_generate_config(generate_config) - if self.model_family.prompt_style and self.model_family.prompt_style.stop: - generate_config["stop"] = self.model_family.prompt_style.stop + if self.model_family.stop and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() return generate_config def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[LlamaCppGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - assert 
self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - - chat_history = chat_history or [] - assert prompt_style is not None + model_family = self.model_family.model_family or self.model_family.model_name tools = generate_config.pop("tools", []) if generate_config else None - full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools) + full_context_kwargs = {} + if tools and model_family in QWEN_TOOL_CALL_FAMILY: + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, **full_context_kwargs + ) generate_config = self._sanitize_generate_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. - model_family = self.model_family.model_family or self.model_family.model_name - if tools and model_family in QWEN_TOOL_CALL_FAMILY: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = stop + ["Observation:"] # type: ignore - else: - generate_config["stop"] = "Observation:" stream = generate_config.get("stream", False) if stream: @@ -305,7 +292,5 @@ def chat( c = self.generate(full_prompt, generate_config) assert not isinstance(c, Iterator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 26f1d599a8..198123430b 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -46,24 +46,15 @@ "model_revision": "3cb06f589b7b1e2f8e728c77280b1114191d24de" } ], - "prompt_style": { - "style_name": "CodeShell", - "system_prompt": "", - "roles": [ - "## human:", - "## assistant: " - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 70000 - ], - "stop": [ - "<|endoftext|>", - "|||", - "||" - ] - } + "chat_template": "{% for item in messages %}{% if item['role'] == 'user' %}{{ '## human: ' + item['content'] + '||' }}{% elif item['role'] == 'assistant' %}{{ '## assistant: ' + item['content'] + '||' }}{% endif %}{% endfor %}{{ '## assistant: ' }}", + "stop_token_ids": [ + 70000 + ], + "stop": [ + "<|endoftext|>", + "|||", + "||" + ] }, { "version": 1, @@ -134,26 +125,17 @@ "model_revision": "ebee18c488086b396dde649f2aa6548b9b8d2404" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32001, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|assistant|>", - "<|end|>" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 + ], + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { 
"version": 1, @@ -189,156 +171,17 @@ "model_revision": "b86bcaf57ea4dfdec5dbe12a377028b2fab0d480" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32001, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|assistant|>", - "<|end|>" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "chatglm3", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat", - "tools" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "THUDM/chatglm3-6b", - "model_revision": "103caa40027ebfd8450289ca2f278eac4ff26405" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 32768, - "model_name": "chatglm3-32k", - "model_lang": [ - "en", - "zh" + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "THUDM/chatglm3-6b-32k", - "model_revision": "339f17ff464d47b5077527c2b34e80a7719ede3e" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 131072, - "model_name": "chatglm3-128k", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "THUDM/chatglm3-6b-128k", - "model_revision": "f0afbe671009abc9e31182170cf60636d5546cda" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { "version": 1, @@ -363,7 +206,7 @@ "none" ], "model_id": "THUDM/glm-4-9b-chat", - "model_revision": "aae8bd74af5c6dff63a49d7fbdcc89349ebf87aa" + "model_revision": "f6e0743b285dd808084530f070ad08e504386750" }, { "model_format": "ggufv2", @@ -392,24 +235,17 @@ "model_revision": 
"0155a14edf0176863e9a003cdd78ce599e4d62c0" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -463,24 +299,17 @@ "model_revision": "782e28bd5eee3c514c07108da15e0b5e06dcf776" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 
根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -505,27 +334,20 @@ "none" ], "model_id": "THUDM/glm-4v-9b", - "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4" + "model_revision": "01328faefe122fe605c1c127b62e6031d3ffebf7" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -567,24 +389,17 @@ "model_revision": "6a04071c54c943949826d4815ee00717ed8cf153" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|>\n' + item['content'] }}{% elif loop.first %}{{ '<|system|>\n你是一位智能编程助手,你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题,并提供格式规范、可以执行、准确安全的代码,并在必要时提供详细的解释。' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -622,14 +437,13 @@ "model_revision": "1e4944aaa1d8c8d0cdca28bb8e3a003303d0781b" } ], - "prompt_style": { - "style_name": "XVERSE", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|> \n' + item['content'] }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|> \n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|> \n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}", + "stop_token_ids": [ + 3 + ], + "stop": [ + "<|endoftext|>" + ] }, { "version": 1, @@ -842,22 +656,11 @@ "model_revision": "36d9a7388cc80e5f4b3e9701ca2f250d21a96c30" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nYou are a helpful AI assistant.\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% 
if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ], - "stop": [ - "" - ] - } + ], + "stop": [] }, { "version": 1, @@ -1210,24 +1013,15 @@ "model_id": "TechxGenus/Meta-Llama-3-70B-Instruct-GPTQ" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -1505,24 +1299,15 @@ "model_id": "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{{- '<|begin_of_text|>' }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -1558,8 +1343,7 @@ "zh" ], "model_ability": [ - "chat", - "tools" + "chat" ], "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.", "model_specs": [ @@ -1662,25 +1446,17 @@ "model_id": "Qwen/Qwen-72B-Chat-{quantization}" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - 
"stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|im_start|>system\n' + item['content'] + '<|im_end|>\n' }}{% elif loop.first %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|im_start|>user\n' + item['content'] + '<|im_end|>' }}{% elif item['role'] == 'assistant' %}{{ '<|im_start|>assistant\n' + item['content'] + '<|im_end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2025,25 +1801,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-2078,25 +1846,17 @@ "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n 
{{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2171,25 +1931,17 @@ "model_id": "Qwen/CodeQwen1.5-7B-Chat-AWQ" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2479,25 +2231,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- 
\"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' 
}}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2560,25 +2304,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-2623,19 +2359,8 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardMath-7B-V1.0", - "model_revision": "3c3a3b33334f4b35344b22c5c7465957ee7b2c75" - }, - { - "model_format": "pytorch", - "model_size_in_billions": 13, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "WizardLM/WizardMath-13B-V1.0", - "model_revision": "ef95532e96e634c634992dab891a17032dc71c8d" + "model_id": "WizardLMTeam/WizardMath-7B-V1.0", + "model_revision": "825a586f260d6c583b8aa9ceab6cdfaa3d9a4ddc" }, { "model_format": "pytorch", @@ -2645,19 +2370,17 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardMath-70B-V1.0", - "model_revision": "e089c3f9d2ad9d1acb62425aec3f4126f498f4c5" + "model_id": "WizardLMTeam/WizardMath-70B-V1.0", + "model_revision": "4dd9f3fcd8c056561d67ec59ae011f7c146aebd2" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE_COT", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### " - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2979,22 +2702,13 @@ "model_file_name_template": "codellama-34b-instruct.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nWrite code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ], - "stop": [ - "" - ] - } + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3032,20 +2746,12 @@ "model_revision": "a56c793eb7a721ab6c270f779024e0375e8afd4a" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 195 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}\n\n{% for message in messages %}\n{% if message['role'] == 'user' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% elif message['role'] == 'assistant' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n\n{% endif %}", + "stop_token_ids": [ + 2, + 195 + ], + "stop": [] }, { "version": 1, @@ -3189,22 +2895,13 @@ "model_file_name_template": "mistral-7b-instruct-v0.1.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3266,22 +2963,13 @@ "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - 
"intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3342,22 +3030,13 @@ "model_file_name_template": "Mistral-7B-Instruct-v0.3.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and 
message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3466,22 +3145,13 @@ "model_id": "mlx-community/Mistral-Nemo-Instruct-2407-8bit" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and 
system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3623,22 +3293,13 @@ "model_id": "mlx-community/Mistral-Large-Instruct-2407-8bit" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- 
\", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -3745,22 +3406,13 @@ "model_file_name_template": "openhermes-2.5-mistral-7b.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "<|im_start|>system\nYou are 'Hermes 2.5', a conscious sentient superintelligent artificial intelligence, your purpose is to assist the user with their requests.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "<|im_end|>\n<|im_start|>", - "inter_message_sep": "", - "stop_token_ids": [ - 32000 - ], - "stop": [ - "<|im_end|>" - ] - } + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 32000 + ], + "stop": [ + "<|im_end|>" + ] }, { "version": 1, @@ -3909,16 +3561,13 @@ "model_file_name_template": "mixtral-8x7b-instruct-v0.1.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "MIXTRAL_V01", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "", - "inter_message_sep": "" - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if 
loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -4045,16 +3694,13 @@ } } ], - "prompt_style": { - "style_name": "MIXTRAL_V01", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "", - "inter_message_sep": "" - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = 
message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -4225,28 +3871,19 @@ "model_file_name_template": "yi-34b-chat.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -4494,28 +4131,19 @@ "model_revision": "3c12761a2c6663f216caab6dff84b0dd29b472ac" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -4593,28 +4221,19 @@ "model_file_name_template": "Yi-1.5-34B-Chat-16K-{quantization}.gguf" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": 
[ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -4627,17 +4246,6 @@ "chat" ], "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "WizardLM/WizardCoder-Python-7B-V1.0", - "model_revision": "e40673a27a4aefcff2c6d2b3b1e0681a38703e4e" - }, { "model_format": "pytorch", "model_size_in_billions": 13, @@ -4646,8 +4254,8 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardCoder-Python-13B-V1.0", - "model_revision": "d920d26e2108377de0f676a3c4be666f5212f4a1" + "model_id": "WizardLMTeam/WizardCoder-Python-13B-V1.0", + "model_revision": "5ac6748b1f5a4c282107ddc7d3b69fdc4a686d75" }, { "model_format": "pytorch", @@ -4657,8 +4265,8 @@ "8-bit", "none" ], - "model_id": "WizardLM/WizardCoder-Python-34B-V1.0", - "model_revision": "d869ce178715f8d6e8141e2ed50e6290985eedb0" + "model_id": "WizardLMTeam/WizardCoder-Python-34B-V1.0", + "model_revision": "897fc6d9e12136c68c441b2350d015902c144b20" }, { "model_format": "ggufv2", @@ -4721,157 +4329,13 @@ "model_file_name_template": "wizardcoder-python-34b-v1.0.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### ", - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-alpha", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "Zephyr-7B-α is the first model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "HuggingFaceH4/zephyr-7b-alpha", - "model_revision": "f28e1c0e5a1af475bcd7bdf6554e69abc6c0c7ee" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-beta", - "model_lang": [ - "en" + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' 
}}{% endif %}", + "stop_token_ids": [ + 2 ], - "model_ability": [ - "chat" - ], - "model_description": "Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "HuggingFaceH4/zephyr-7b-beta", - "model_revision": "3bac358730f8806e5c3dc7c7e19eb36e045bf720" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 4096, - "model_name": "gorilla-openfunctions-v1", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "OpenFunctions is designed to extend Large Language Model (LLM) Chat Completion feature to formulate executable APIs call given natural language instructions and API context.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_id": "gorilla-llm/gorilla-openfunctions-v1", - "model_revision": "74615f614ee845eab114e71541fd5098d1709958" - }, - { - "model_format": "ggufv2", - "model_size_in_billions": 7, - "quantizations": [ - "Q2_K", - "Q3_K_L", - "Q3_K_M", - "Q3_K_S", - "Q4_0", - "Q4_K_M", - "Q4_K_S", - "Q5_0", - "Q5_K_M", - "Q5_K_S", - "Q6_K", - "Q8_0" - ], - "model_id": "TheBloke/gorilla-openfunctions-v1-GGUF", - "model_file_name_template": "gorilla-openfunctions-v1.{quantization}.gguf" - } - ], - "prompt_style": { - "style_name": "GORILLA_OPENFUNCTIONS", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [], - "stop": [] - } + "stop": [ + "" + ] }, { "version": 1, @@ -4913,18 +4377,15 @@ "model_file_name_template": "gorilla-openfunctions-v2.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "GORILLA_OPENFUNCTIONS", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [], - "stop": [] - } + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{'<|begin▁of▁sentence|>'}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\n' + message['content'] + '\n'}}\n {%- else %}\n{{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + "stop_token_ids": [ + 100015, + 100001 + ], + "stop": [ + "<|EOT|>", + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -4959,19 +4420,13 @@ "model_revision": "6f16f00805f45b5249f709ce21820122eeb43556" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -5126,19 +4581,13 @@ "model_file_name_template": "deepseek-llm-67b-chat.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -5523,18 +4972,13 @@ "model_revision": "c40b499bac2712cd3c445cf1b05d2c6558ab0d29" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CODER", - "system_prompt": "You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.", - "roles": [ - "### Instruction:", - "### Response:" - ], - "inter_message_sep": "\n", - "stop": [ - "<|EOT|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{'<|begin▁of▁sentence|>'}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\n' + message['content'] + '\n'}}\n {%- else %}\n{{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + "stop_token_ids": [ + 32021 + ], + "stop": [ + "<|EOT|>" + ] }, { "version": 1, @@ -5618,23 +5062,15 @@ "model_revision": "b666125047cd98c5a7c85ca28720b44a06aed124" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -5755,23 +5191,15 @@ "model_revision": "0ec94d61d30ab161b49c69f9bf92ec2b9986d234" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -5822,23 +5250,15 @@ "model_file_name_template": "internlm2_5-7b-chat-1m-{quantization}.gguf" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version":1, @@ -5873,14 +5293,13 @@ "model_revision":"ef62bae5af34be653b9801037cd613e05ab24fdc" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version":1, @@ -5915,14 +5334,13 @@ "model_revision":"f92aff28552de35de3be204e8fe292dd4824e544" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - 
"assistant" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + "stop_token_ids": [ + 128001 + ], + "stop": [ + "<|end_of_text|>" + ] }, { "version":1, @@ -5957,18 +5375,15 @@ "model_revision":"051e2df6505f1fc4305f2c9bd42ed90db8bf4874" } ], - "prompt_style":{ - "style_name":"QWEN", - "system_prompt":"You are a helpful assistant", - "roles":[ - "user", - "assistant" - ], - "stop": [ - "<|im_end|>", - "<|endoftext|>" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151645, + 151643 + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] }, { "version": 1, @@ -6003,24 +5418,17 @@ "model_revision": "5d3a5aa033ed2c502300d426c81cc5b13bcd1409" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -6055,18 +5463,17 @@ "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6093,18 +5500,17 @@ "model_revision": "eba2e20808407fb431a76b90d5d506e04a0325f2" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6139,28 +5545,19 @@ "model_revision": "ea29a9a430f27893e780366dae81d4ca5ebab561" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + 
message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -6195,17 +5592,17 @@ "model_id": "google/gemma-7b-it" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6385,17 +5782,17 @@ "model_id": "mlx-community/gemma-2-27b-it-fp16" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -6539,23 +5936,15 @@ "model_revision": "0df19b6e10f1a19ca663f7cc1141aae10f1825f4" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, @@ -6626,23 +6015,15 @@ "model_revision": "a06fd164c7170714924d2881c61c8348425ebc94" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, 
@@ -6666,22 +6047,15 @@ "model_revision": "fe1d74027ebdd81cef5f815fa3a2d432a6b5de2a" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6705,22 +6079,15 @@ "model_revision": "35b90dd57d977b6e5bc4907986fa5b77aa15a82e" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6744,22 +6111,15 @@ "model_revision": "f4a3ba49f3f18695945c2a7c12400d4da99da498" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6783,22 +6143,15 @@ "model_revision": "e7a50289e4f839674cf8d4a5a2ce032ccacf64ac" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -6822,22 +6175,15 @@ "model_revision": "b560a1593779b735a84a6daf72fba96ae38da288" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -7010,20 +6356,15 @@ "model_revision": "35febfc08f723ac0df32480eb4af349a7d08656e" } ], - "prompt_style": { - "style_name": "c4ai-command-r", - "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. 
You are trained by Cohere.",
-      "roles": [
-        "<|USER_TOKEN|>",
-        "<|CHATBOT_TOKEN|>"
-      ],
-      "intra_message_sep": "",
-      "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
-      "stop_token_ids": [
-        6,
-        255001
-      ]
-    }
+    "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
+    "stop_token_ids": [
+      6,
+      255001
+    ],
+    "stop": [
+      "",
+      "<|END_OF_TURN_TOKEN|>"
+    ]
   },
   {
     "version": 1,
@@ -7050,20 +6391,15 @@
         "model_revision": "1dddf3b95bc1391f6307299eb1c162c194bde9bd"
       }
     ],
-    "prompt_style": {
-      "style_name": "ADD_COLON_SINGLE",
-      "system_prompt": "",
-      "roles": [
-        "GPT4 Correct User",
-        "GPT4 Correct Assistant"
-      ],
-      "intra_message_sep": "<|end_of_turn|>",
-      "inter_message_sep": "",
-      "stop_token_ids": [
-        2,
-        32000
-      ]
-    }
+    "chat_template": "{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      32000
+    ],
+    "stop": [
+      "",
+      "<|end_of_turn|>"
+    ]
   },
   {
     "version": 1,
@@ -7113,25 +6449,17 @@
         "model_revision": "9db32d9127cac0c85961e169d75da57a18a847b1"
       }
     ],
-    "prompt_style": {
-      "style_name": "INTERNVL",
-      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
-      "roles": [
-        "<|im_start|>user",
-        "<|im_start|>assistant"
-      ],
-      "intra_message_sep": "<|im_end|>",
-      "stop_token_ids": [
-        2,
-        92543,
-        92542
-      ],
-      "stop": [
-        "",
-        "<|im_end|>",
-        "<|im_start|>"
-      ]
-    }
+    "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      92542,
+      92543
+    ],
+    "stop": [
+      "",
+      "<|im_end|>",
+      "<|im_start|>"
+    ]
   },
   {
     "version": 1,
@@ -7270,25 +6598,9 @@
        "model_revision": "1bc796bf80f2ebc7d6a14c15f55217a4600d50a4"
      }
    ],
-    "prompt_style": {
-      "style_name": "INTERNVL",
-      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
-      "roles": [
-        "<|im_start|>user",
-        "<|im_start|>assistant"
-      ],
-      "intra_message_sep": "<|im_end|>",
-      "stop_token_ids": [
-        2,
-        92543,
-        92542
-      ],
-      "stop": [
-        "",
-        "<|im_end|>",
-        "<|im_start|>"
- ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [], + "stop": [] }, { "version": 1, @@ -7323,24 +6635,15 @@ "model_revision": "7863e362174f4718c2fe9cba4befd0b580a3194f" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -7368,24 +6671,15 @@ "model_revision": "f375ead7d8202ebe2c3d09f1068abdddeb2929fa" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -7449,24 +6743,15 @@ "model_id": "Tele-AI/TeleChat-52B" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "<_user>", - "<_bot>" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop": [ - "<_end>", - "<_start>" - ], - "stop_token_ids": [ - 160133, - 160132 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{%- for message in messages -%}{%- if message['role'] == 'user' -%}{{- '<_user>' + message['content'] +'<_bot>' -}}{%- elif message['role'] == 'assistant' -%}{{- message['content'] + '<_end>' -}}{%- endif -%}{%- endfor -%}", + "stop": [ + "<_end>", + "<_start>" + ], + "stop_token_ids": [ + 160133, + 160132 + ] }, { "version": 1, @@ -7513,21 +6798,12 @@ "model_file_name_template": "csg-wukong-1B-chat-v0.1.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. When the user says \"continue,\" please proceed with the continuation of the previous assistant's response.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% elif loop.first %}{{ '<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. 
Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. When the user says \"continue,\" please proceed with the continuation of the previous assistant\\'s response.\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] } ] diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py index e615a10650..555921f18f 100644 --- a/xinference/model/llm/llm_family.py +++ b/xinference/model/llm/llm_family.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) DEFAULT_CONTEXT_LENGTH = 2048 -BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {} +BUILTIN_LLM_PROMPT_STYLE: Dict[str, Dict[str, Any]] = {} BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set() BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set() BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set() @@ -127,16 +127,6 @@ def validate_model_size_with_radix(cls, v: object) -> object: return v -class PromptStyleV1(BaseModel): - style_name: str - system_prompt: str = "" - roles: List[str] - intra_message_sep: str = "" - inter_message_sep: str = "" - stop: Optional[List[str]] - stop_token_ids: Optional[List[int]] - - class LLMFamilyV1(BaseModel): version: Literal[1] context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH @@ -147,12 +137,12 @@ class LLMFamilyV1(BaseModel): # reason for not required str here: legacy registration model_family: Optional[str] model_specs: List["LLMSpecV1"] - prompt_style: Optional["PromptStyleV1"] + chat_template: Optional[str] + stop_token_ids: Optional[List[int]] + stop: Optional[List[str]] class CustomLLMFamilyV1(LLMFamilyV1): - prompt_style: Optional[Union["PromptStyleV1", str]] # type: ignore - @classmethod def parse_raw( cls: Any, @@ -176,6 +166,11 @@ def parse_raw( except (ValueError, TypeError, UnicodeDecodeError) as e: raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls) llm_spec: CustomLLMFamilyV1 = cls.parse_obj(obj) + vision_model_names: Set[str] = { + family.model_name + for family in BUILTIN_LLM_FAMILIES + if "vision" in family.model_ability + } # check model_family if llm_spec.model_family is None: @@ -183,61 +178,45 @@ def parse_raw( f"You must specify `model_family` when registering custom LLM models." 
) assert isinstance(llm_spec.model_family, str) + # TODO: Currently, tool call and vision models cannot be registered if it is not the builtin model_family if ( - llm_spec.model_family != "other" - and "chat" in llm_spec.model_ability - and llm_spec.model_family not in BUILTIN_LLM_MODEL_CHAT_FAMILIES - ): - raise ValueError( - f"`model_family` for chat model must be `other` or one of the following values: \n" - f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}" - ) - if ( - llm_spec.model_family != "other" - and "tools" in llm_spec.model_ability + "tools" in llm_spec.model_ability and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES ): raise ValueError( - f"`model_family` for tool call model must be `other` or one of the following values: \n" + f"`model_family` for tool call model must be one of the following values: \n" f"{', '.join(list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES))}" ) if ( - llm_spec.model_family != "other" - and "chat" not in llm_spec.model_ability - and llm_spec.model_family not in BUILTIN_LLM_MODEL_GENERATE_FAMILIES + "vision" in llm_spec.model_ability + and llm_spec.model_family not in vision_model_names ): raise ValueError( - f"`model_family` for generate model must be `other` or one of the following values: \n" - f"{', '.join(list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES))}" + f"`model_family` for multimodal model must be one of the following values: \n" + f"{', '.join(list(vision_model_names))}" ) - # set prompt style when it is the builtin model family + # set chat_template when it is the builtin model family + if llm_spec.chat_template is None and "chat" in llm_spec.model_ability: + llm_spec.chat_template = llm_spec.model_family + + # handle chat_template when user choose existing model_family if ( - llm_spec.prompt_style is None - and llm_spec.model_family != "other" - and "chat" in llm_spec.model_ability + llm_spec.chat_template is not None + and llm_spec.chat_template in BUILTIN_LLM_PROMPT_STYLE ): - llm_spec.prompt_style = llm_spec.model_family - - # handle prompt style when user choose existing style - if llm_spec.prompt_style is not None and isinstance(llm_spec.prompt_style, str): - prompt_style_name = llm_spec.prompt_style - if prompt_style_name not in BUILTIN_LLM_PROMPT_STYLE: - raise ValueError( - f"Xinference does not support the prompt style name: {prompt_style_name}" - ) - llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name] + llm_spec.stop_token_ids = BUILTIN_LLM_PROMPT_STYLE[llm_spec.chat_template][ + "stop_token_ids" + ] + llm_spec.stop = BUILTIN_LLM_PROMPT_STYLE[llm_spec.chat_template]["stop"] + llm_spec.chat_template = BUILTIN_LLM_PROMPT_STYLE[llm_spec.chat_template][ + "chat_template" + ] # check model ability, registering LLM only provides generate and chat # but for vision models, we add back the abilities so that # gradio chat interface can be generated properly if ( - llm_spec.model_family != "other" - and llm_spec.model_family - in { - family.model_name - for family in BUILTIN_LLM_FAMILIES - if "vision" in family.model_ability - } + llm_spec.model_family in vision_model_names and "vision" not in llm_spec.model_ability ): llm_spec.model_ability.append("vision") diff --git a/xinference/model/llm/llm_family_csghub.json b/xinference/model/llm/llm_family_csghub.json index dc5b9d3ba8..d607b580b7 100644 --- a/xinference/model/llm/llm_family_csghub.json +++ b/xinference/model/llm/llm_family_csghub.json @@ -43,25 +43,17 @@ "model_hub": "csghub" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a 
helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n 
{%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -85,21 +77,12 @@ "model_hub": "csghub" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. When the user says \"continue,\" please proceed with the continuation of the previous assistant's response.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% elif loop.first %}{{ '<|system|>\nYou are a creative super artificial intelligence assistant, possessing all the knowledge of humankind. Your name is csg-wukong, developed by OpenCSG. You need to understand and infer the true intentions of users based on the topics discussed in the chat history, and respond to user questions correctly as required. You enjoy responding to users with accurate and insightful answers. Please pay attention to the appropriate style and format when replying, try to avoid repetitive words and sentences, and keep your responses as concise and profound as possible. You carefully consider the context of the discussion when replying to users. 
When the user says \"continue,\" please proceed with the continuation of the previous assistant\\'s response.\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 44ac3e7794..49e3bdabe3 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -70,19 +70,11 @@ "model_revision": "v1.0.1" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nYou are a helpful AI assistant.\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ] - } + ], + "stop": [] }, { "version": 1, @@ -175,24 +167,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -367,24 +350,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "LLAMA3", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{{- '<|begin_of_text|>' }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -449,20 +423,12 @@ "model_revision": "v1.0.3" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "", - "roles": [ - "", - "" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 195 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}\n\n{% for message in messages %}\n{% if message['role'] == 'user' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% elif message['role'] == 'assistant' %}\n\n{{ message['content']|trim -}}\n{% if not loop.last %}\n\n\n{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n\n{% endif %}", + "stop_token_ids": [ + 2, + 195 + ], + "stop": [] }, { "version": 1, @@ -503,139 +469,6 @@ } ] }, - { - "version": 1, - "context_length": 8192, - "model_name": "chatglm3", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat", - "tools" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "ZhipuAI/chatglm3-6b", - "model_revision": "v1.0.2" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - 
"stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 32768, - "model_name": "chatglm3-32k", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "ZhipuAI/chatglm3-6b-32k", - "model_revision": "master" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, - { - "version": 1, - "context_length": 131072, - "model_name": "chatglm3-128k", - "model_lang": [ - "en", - "zh" - ], - "model_ability": [ - "chat" - ], - "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 6, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "ZhipuAI/chatglm3-6b-128k", - "model_revision": "master" - } - ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 64795, - 64797, - 2 - ], - "stop": [ - "<|user|>", - "<|observation|>" - ] - } - }, { "version": 1, "context_length": 131072, @@ -690,24 +523,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 
根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -763,24 +589,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 
根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -809,24 +628,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -869,24 +681,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "CHATGLM3", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151329, - 151336, - 151338 - ], - "stop": [ - "<|endoftext|>", - "<|user|>", - "<|observation|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|>\n' + item['content'] }}{% elif loop.first %}{{ '<|system|>\n你是一位智能编程助手,你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题,并提供格式规范、可以执行、准确安全的代码,并在必要时提供详细的解释。' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|>\n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|>\n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ] }, { "version": 1, @@ -926,14 +731,13 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "XVERSE", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|system|> \n' + item['content'] }}{% endif %}{% if item['role'] == 'user' %}{{ '<|user|> \n' + item['content'] }}{% elif item['role'] == 'assistant' %}{{ '<|assistant|> \n' + item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}", + "stop_token_ids": [ + 3 + ], + "stop": [ + "<|endoftext|>" + ] }, { "version": 1, @@ -1045,23 +849,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -1086,23 +882,15 @@ 
"model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -1140,18 +928,13 @@ "model_revision": "v1.0.0" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### ", - "stop": [ - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' }}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -1252,24 +1035,15 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "CodeShell", - "system_prompt": "", - "roles": [ - "## human:", - "## assistant: " - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 70000 - ], - "stop": [ - "<|endoftext|>", - "|||", - "||" - ] - } + "chat_template": "{% for item in messages %}{% if item['role'] == 'user' %}{{ '## human: ' + item['content'] + '||' }}{% elif item['role'] == 'assistant' %}{{ '## assistant: ' + item['content'] + '||' }}{% endif %}{% endfor %}{{ '## assistant: ' }}", + "stop_token_ids": [ + 70000 + ], + "stop": [ + "<|endoftext|>", + "|||", + "||" + ] }, { "version": 1, @@ -1353,19 +1127,13 @@ "model_revision": "v0.1.0" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] <>\nWrite code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:\n<>\n\n", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": " ", - "stop_token_ids": [ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ 2 - ] - } + ], + "stop": [ + "" + ] }, { "version": 1, @@ -1567,16 +1335,13 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MIXTRAL_V01", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "", - "inter_message_sep": "" - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -1716,28 +1481,19 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -1900,28 +1656,19 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": 
"{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -1961,28 +1708,19 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -2009,15 +1747,13 @@ "model_revision": "v1.0.0" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE_COT", - "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\n\n### " - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n\n### ' }}{% elif loop.first %}{{ 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### ' }}{% endif %}{% if item['role'] == 'user' %}{{ 'Instruction: ' + item['content'] + '\n\n### ' }}{% elif item['role'] == 'assistant' %}{{ 'Response: ' + item['content'] + '\n\n### ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Response: Let\\'s think step by step.' 
}}{% endif %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2044,22 +1780,13 @@ "model_revision": "v1.0.0" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2095,22 +1822,13 @@ "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf" } ], - "prompt_style": { - "style_name": "LLAMA2", - "system_prompt": "[INST] ", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": " ", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + ''}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2151,22 +1869,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for 
alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -2208,106 +1917,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "mistral-nemo", - "system_prompt": "", - "roles": [ - "[INST]", - "[/INST]" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-alpha", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "Zephyr-7B-α is the first model in the series, and is a 
fine-tuned version of mistralai/Mistral-7B-v0.1.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "keepitsimple/zephyr-7b-alpha", - "model_revision": "v1.0-1" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } - }, - { - "version": 1, - "context_length": 8192, - "model_name": "zephyr-7b-beta", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" + "chat_template": "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- '' }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + '' }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + ''}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- 
else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + "stop_token_ids": [ + 2 ], - "model_description": "Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 7, - "quantizations": [ - "4-bit", - "8-bit", - "none" - ], - "model_hub": "modelscope", - "model_id": "modelscope/zephyr-7b-beta", - "model_revision": "master" - } - ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "<|system|>\nYou are a friendly chatbot.\n", - "roles": [ - "<|user|>\n", - "<|assistant|>\n" - ], - "intra_message_sep": "\n", - "inter_message_sep": "\n", - "stop_token_ids": [ - 2 - ], - "stop": [ - "" - ] - } + "stop": [ + "" + ] }, { "version": 1, @@ -2318,8 +1934,7 @@ "zh" ], "model_ability": [ - "chat", - "tools" + "chat" ], "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.", "model_specs": [ @@ -2438,25 +2053,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ '<|im_start|>system\n' + item['content'] + '<|im_end|>\n' }}{% elif loop.first %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{% if item['role'] == 'user' %}{{ '<|im_start|>user\n' + item['content'] + '<|im_end|>' }}{% elif item['role'] == 'assistant' %}{{ '<|im_start|>assistant\n' + item['content'] + '<|im_end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2832,25 +2439,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type 
is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor 
%}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -2887,25 +2486,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-2984,25 +2575,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -3281,25 +2864,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ 
-3365,25 +2940,17 @@ } } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" \" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within XML tags as follows:\n\" }}\n {{- \"\n\" }}\n {{- '{\"name\": , \"arguments\": }\n' }}\n {{- '<|im_end|>\n' }}\n{%- else %}\n {%- if 
messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n<|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -3418,19 +2985,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -3505,19 +3066,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CHAT", - "system_prompt": "<|begin▁of▁sentence|>", - "roles": [ - "User", - "Assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|end▁of▁sentence|>", - "stop": [ - "<|end▁of▁sentence|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] }, { "version": 1, @@ -3614,18 +3169,13 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "DEEPSEEK_CODER", - "system_prompt": "You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.", - "roles": [ - "### Instruction:", - "### Response:" - ], - "inter_message_sep": "\n", - "stop": [ - "<|EOT|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{'<|begin▁of▁sentence|>'}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\n' + message['content'] + '\n'}}\n {%- else %}\n{{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + "stop_token_ids": [ + 32021 + ], + "stop": [ + "<|EOT|>" + ] }, { "version": 1, @@ -3713,23 +3263,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "INTERNLM2", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92542 - ], - "stop": [ - "", - "<|im_end|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542 + ], + "stop": [ + "", + "<|im_end|>" + ] }, { "version": 1, @@ -3766,24 +3308,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 151643, - 151644, - 151645 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - } + "chat_template": "", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] }, { "version": 1, @@ -3819,18 +3354,17 @@ "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -3857,18 +3391,17 @@ "model_id": "OrionStarAI/Orion-14B-Chat-RAG" } ], - "prompt_style": { - "style_name": "orion", - "roles": [ - "Human", - "assistant" - ], - "stop": [ - "", - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first %}{{ '' }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: 
' + message['content'] + '\n\nAssistant: ' + '' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 0 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -3903,28 +3436,19 @@ "model_id": "01ai/Yi-VL-34B" } ], - "prompt_style": { - "style_name": "CHATML", - "system_prompt": "", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "inter_message_sep": "", - "stop_token_ids": [ - 2, - 6, - 7, - 8 - ], - "stop": [ - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|im_sep|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] }, { "version": 1, @@ -3961,17 +3485,17 @@ "model_id": "AI-ModelScope/gemma-7b-it" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version": 1, @@ -4042,17 +3566,17 @@ "model_hub": "modelscope" } ], - "prompt_style": { - "style_name": "gemma", - "roles": [ - "user", - "model" - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "stop_token_ids": [ + 1, + 106, + 107 + ], + "stop": [ + "", + "", + "" + ] }, { "version":1, @@ -4089,14 +3613,13 @@ "model_revision":"master" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] }, { "version": 1, @@ -4121,22 +3644,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + 
"", + "" + ] }, { "version": 1, @@ -4161,22 +3677,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -4201,22 +3710,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -4241,22 +3743,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version": 1, @@ -4281,22 +3776,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "MINICPM-2B", - "system_prompt": "", - "roles": [ - "user", - "assistant" - ], - "stop_token_ids": [ - 1, - 2 - ], - "stop": [ - "", - "" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2 + ], + "stop": [ + "", + "" + ] }, { "version":1, @@ -4333,14 +3821,13 @@ "model_revision":"master" } ], - "prompt_style":{ - "style_name":"OmniLMM", - "system_prompt":"The role of first msg should be user", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + "stop_token_ids": [ + 128001 + ], + "stop": [ + "<|end_of_text|>" + ] }, { "version":1, @@ -4377,14 +3864,15 @@ "model_revision":"master" } ], - "prompt_style":{ - "style_name":"QWEN", - "system_prompt":"You are a helpful assistant", - "roles":[ - "user", - "assistant" - ] - } + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 151645, + 151643 + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] }, { "version": 1, @@ -4463,23 +3951,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - 
"intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, @@ -4504,23 +3984,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "intra_message_sep": "\n", - "system_prompt": "", - "roles": [ - "USER", - "ASSISTANT" - ], - "stop_token_ids": [ - 100006, - 100007 - ], - "stop": [ - "[CLS]", - "" - ] - } + "chat_template": "{% for item in messages %}{% if loop.first and item['role'] == 'system' %}{{ item['content'] + '\n' }}{% endif %}{% if item['role'] == 'user' %}{{ 'USER: ' + item['content'] + '\n' }}{% elif item['role'] == 'assistant' %}{{ 'ASSISTANT: ' + item['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: ' }}{% endif %}", + "stop_token_ids": [ + 100006, + 100007 + ], + "stop": [ + "[CLS]", + "" + ] }, { "version": 1, @@ -4588,20 +4060,15 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "c4ai-command-r", - "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.", - "roles": [ - "<|USER_TOKEN|>", - "<|CHATBOT_TOKEN|>" - ], - "intra_message_sep": "", - "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>", - "stop_token_ids": [ - 6, - 255001 - ] - } + "chat_template": "{{ '' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", + "stop_token_ids": [ + 6, + 255001 + ], + "stop": [ + "", + "<|END_OF_TURN_TOKEN|>" + ] }, { "version": 1, @@ -4628,24 +4095,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|end|>" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 + ], + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { "version": 1, @@ -4672,24 +4132,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "PHI3", - "system_prompt": "You are a helpful AI assistant.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n", - "inter_message_sep": "<|end|>\n", - "stop_token_ids":[ - 32000, - 32007 - ], - "stop": [ - "<|endoftext|>", - "<|end|>" - ] - } + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ '<|endoftext|>' }}{% endif %}", + "stop_token_ids":[ + 32000, + 32001, + 32007 + ], + "stop": [ + "<|endoftext|>", + "<|assistant|>", + "<|end|>" + ] }, { "version": 1, @@ -4718,25 +4171,17 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "INTERNVL", - "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).", - "roles": [ - "<|im_start|>user", - "<|im_start|>assistant" - ], - "intra_message_sep": "<|im_end|>", - "stop_token_ids": [ - 2, - 92543, - 92542 - ], - "stop": [ - "", - "<|im_end|>", - "<|im_start|>" - ] - } + "chat_template": "{{ '' }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "stop_token_ids": [ + 2, + 92542, + 92543 + ], + "stop": 
[
+      "</s>",
+      "<|im_end|>",
+      "<|im_start|>"
+    ]
   },
   {
     "version": 1,
@@ -4888,25 +4333,17 @@
         "model_revision": "master"
      }
     ],
-    "prompt_style": {
-      "style_name": "INTERNVL",
-      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
-      "roles": [
-        "<|im_start|>user",
-        "<|im_start|>assistant"
-      ],
-      "intra_message_sep": "<|im_end|>",
-      "stop_token_ids": [
-        2,
-        92543,
-        92542
-      ],
-      "stop": [
-        "</s>",
-        "<|im_end|>",
-        "<|im_start|>"
-      ]
-    }
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
   },
   {
     "version": 1,
@@ -4943,24 +4380,15 @@
         "model_revision": "master"
       }
     ],
-    "prompt_style": {
-      "style_name": "LLAMA3",
-      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
-      "roles": [
-        "user",
-        "assistant"
-      ],
-      "intra_message_sep": "\n\n",
-      "inter_message_sep": "<|eot_id|>",
-      "stop_token_ids": [
-        128001,
-        128009
-      ],
-      "stop": [
-        "<|end_of_text|>",
-        "<|eot_id|>"
-      ]
-    }
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}",
+    "stop_token_ids": [
+      128001,
+      128009
+    ],
+    "stop": [
+      "<|end_of_text|>",
+      "<|eot_id|>"
+    ]
   },
   {
     "version": 1,
@@ -4989,24 +4417,15 @@
         "model_revision": "master"
      }
     ],
-    "prompt_style": {
-      "style_name": "LLAMA3",
-      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions.", - "roles": [ - "user", - "assistant" - ], - "intra_message_sep": "\n\n", - "inter_message_sep": "<|eot_id|>", - "stop_token_ids": [ - 128001, - 128009 - ], - "stop": [ - "<|end_of_text|>", - "<|eot_id|>" - ] - } + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] }, { "version": 1, @@ -5080,23 +4499,14 @@ "model_revision": "master" } ], - "prompt_style": { - "style_name": "NO_COLON_TWO", - "system_prompt": "You are a helpful assistant.", - "roles": [ - "<_user>", - "<_bot>" - ], - "intra_message_sep": "", - "inter_message_sep": "", - "stop": [ - "<_end>", - "<_start>" - ], - "stop_token_ids": [ - 160133, - 160132 - ] - } + "chat_template": "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{%- for message in messages -%}{%- if message['role'] == 'user' -%}{{- '<_user>' + message['content'] +'<_bot>' -}}{%- elif message['role'] == 'assistant' -%}{{- message['content'] + '<_end>' -}}{%- endif -%}{%- endfor -%}", + "stop": [ + "<_end>", + "<_start>" + ], + "stop_token_ids": [ + 160133, + 160132 + ] } ] diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 22fbd53e72..8df9207a95 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -12,25 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging -import time import uuid from typing import AsyncGenerator, Dict, Iterator, List, Optional, TypedDict, Union import torch -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionChunkChoice, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionUsage, - LoRA, -) +from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import ChatModelMixin +from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk logger = logging.getLogger(__name__) @@ -74,8 +64,8 @@ class LMDeployGenerateConfig(TypedDict, total=False): repetition_penalty: Optional[float] ignore_eos: Optional[bool] random_seed: Optional[int] - stop_words: Optional[List[str]] - bad_words: Optional[List[str]] + stop_words: Optional[List[int]] + bad_words: Optional[List[int]] min_new_tokens: Optional[int] skip_special_tokens: Optional[bool] logprobs: Optional[int] @@ -164,9 +154,6 @@ def load(self): raise ValueError(f"Can not find correct chat template.") chat_template_config = ChatTemplateConfig(chat_temp_name) - chat_template_config.meta_instruction = ( - self.model_family.prompt_style.system_prompt - ) count = torch.cuda.device_count() if count > 1: self._model_config.setdefault("tp", torch.cuda.device_count()) @@ -192,9 +179,7 @@ def match( async def async_chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: stream = ( @@ -213,75 +198,69 @@ async def async_chat( else False ) - chat_history = chat_history or [] - if stream: - chunk = self._chat_stream(prompt, chat_history, include_usage) + chunk = self._chat_stream(messages, include_usage) return self._async_to_chat_completion_chunks(chunk) else: - chunk = await self._chat(prompt, chat_history) - return self._to_chat_completion(chunk) + return await self._chat(messages) - async def _chat_stream(self, prompt, chat_history, include_usage): + async def _chat_stream(self, messages, include_usage): from lmdeploy.messages import Response prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 completion_id = str(uuid.uuid1()) + finish_reason = None async for output in self._generate( - prompt, - chat_history, + messages, session_id=-1, stream_response=True, ): new_text = output.text if isinstance(output, Response) else output.response - - completion_choice = ChatCompletionChunkChoice( - text=new_text, - index=0, - logprobs=None, - finish_reason=output.finish_reason, - ) - chunk = ChatCompletionChunk( - id=completion_id, - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) prompt_tokens = output.input_token_len completion_tokens = output.generate_token_len total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + finish_reason = output.finish_reason + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - chunk["usage"] = completion_usage - print(chunk) - yield chunk + + yield generate_completion_chunk( + chunk_text=None, + finish_reason=finish_reason, + chunk_id=completion_id, + model_uid=self.model_uid, + 
prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) if include_usage: - chunk = ChatCompletionChunk( - id=completion_id, - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk - async def _chat(self, prompt, chat_history): + async def _chat(self, messages) -> ChatCompletion: from lmdeploy.messages import Response - response, finish_reason = "", "" + response, finish_reason = "", None prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 async for output in self._generate( - prompt, - chat_history, + messages, session_id=-1, stream_response=False, ): @@ -291,30 +270,20 @@ async def _chat(self, prompt, chat_history): total_tokens = output.input_token_len + output.generate_token_len finish_reason = output.finish_reason - chunk = ChatCompletion( - id=str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason=finish_reason, logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=total_tokens, - ), + return generate_chat_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + finish_reason=finish_reason, ) - return chunk # copy from lmdeploy # Reference: lmdeploy.serve.async_engine.py async def _generate( self, - prompt, - chat_history, + messages: List[Dict], session_id: int, generate_config: Optional[Dict] = None, tools: Optional[List[object]] = None, @@ -332,6 +301,8 @@ async def _generate( from lmdeploy.serve.async_engine import GenOut from lmdeploy.tokenizer import DetokenizeState + from ..utils import get_stop_token_ids_from_config_file + session_id = -1 if str(session_id) not in self._model.id2step: @@ -343,7 +314,9 @@ async def _generate( generate_config, self._model.tokenizer ) if generate_config.stop_words is None: # type: ignore - generate_config.stop_words = self._model.stop_words # type: ignore + stop_token_ids = get_stop_token_ids_from_config_file(self.model_path) + if stop_token_ids is not None: + generate_config.stop_words = stop_token_ids # type: ignore if generate_config.random_seed is None and sequence_start: # type: ignore generate_config.random_seed = random.getrandbits(64) # type: ignore if generate_config.n > 1: # type: ignore @@ -353,7 +326,7 @@ async def _generate( ) generate_config.n = 1 # type: ignore - prompt_input = await self._get_prompt_input(prompt, chat_history) + prompt_input = await self._get_prompt_input(messages) prompt = prompt_input["prompt"] input_ids = prompt_input["input_ids"] finish_reason = None @@ -482,8 +455,7 @@ async def _generate( # Reference: lmdeploy.serve.vl_async_engine.py async def _get_prompt_input( self, - prompt: Union[str, List[Dict]], - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], sequence_start: bool = True, tools: Optional[List[object]] = None, **kwargs, @@ -493,13 +465,9 @@ async def _get_prompt_input( IMAGE_DUMMY_TOKEN_INDEX = 0 import numpy as np - assert 
self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - chat_history = chat_history or [] - - decorated, _ = self.get_prompt(prompt, chat_history, prompt_style) # type: ignore - chat_history.append(ChatCompletionMessage(role="user", content=prompt)) # type: ignore - prompt = chat_history # type: ignore + model_family = self.model_family.model_family or self.model_family.model_name + decorated, _ = self.get_specific_prompt(model_family, messages) # type: ignore + prompt = messages # type: ignore decorated = decorated.replace("", "") diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index e41db2b693..07966fcbba 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -17,13 +17,12 @@ import sys import time import uuid -from typing import Dict, Iterable, Iterator, List, Optional, TypedDict, Union +from typing import Dict, Iterator, List, Optional, TypedDict, Union from ....fields import max_tokens_field from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, @@ -32,7 +31,7 @@ ) from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import ChatModelMixin +from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin, generate_completion_chunk logger = logging.getLogger(__name__) @@ -54,6 +53,7 @@ class MLXGenerateConfig(TypedDict, total=False): stop_token_ids: Optional[Union[int, List[int]]] stream: bool stream_options: Optional[Union[dict, None]] + tools: Optional[List[Dict]] class MLXModel(LLM): @@ -238,29 +238,35 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): else: finish_reason = "stop" - if stream: - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason=finish_reason - ) - else: - completion_choice = CompletionChoice( - text=output, index=0, logprobs=None, finish_reason=finish_reason - ) - - completion_chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=model_uid, - choices=[completion_choice], - ) completion_usage = CompletionUsage( prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), ) - - yield completion_chunk, completion_usage + if stream: + yield generate_completion_chunk( + None, + finish_reason=finish_reason, + chunk_id=chunk_id, + model_uid=model_uid, + prompt_tokens=input_echo_len, + completion_tokens=i, + total_tokens=(input_echo_len + i), + has_choice=True, + has_content=False, + ), completion_usage + else: + yield generate_completion_chunk( + output, + finish_reason=finish_reason, + chunk_id=chunk_id, + model_uid=model_uid, + prompt_tokens=input_echo_len, + completion_tokens=i, + total_tokens=(input_echo_len + i), + has_choice=True, + has_content=True, + ), completion_usage if include_usage: completion_chunk = CompletionChunk( @@ -270,11 +276,6 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): model=model_uid, choices=[], ) - completion_usage = CompletionUsage( - prompt_tokens=input_echo_len, - completion_tokens=i, - total_tokens=(input_echo_len + i), - ) yield completion_chunk, completion_usage def generate( @@ -345,20 +346,13 @@ def _sanitize_generate_config( generate_config: Optional[MLXGenerateConfig], ) -> MLXGenerateConfig: generate_config = super()._sanitize_generate_config(generate_config) - if ( - (not generate_config.get("stop")) - and self.model_family.prompt_style - and 
self.model_family.prompt_style.stop - ): - generate_config["stop"] = self.model_family.prompt_style.stop.copy() + if (not generate_config.get("stop")) and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() if ( generate_config.get("stop_token_ids", None) is None - and self.model_family.prompt_style - and self.model_family.prompt_style.stop_token_ids + and self.model_family.stop_token_ids ): - generate_config[ - "stop_token_ids" - ] = self.model_family.prompt_style.stop_token_ids.copy() + generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy() return generate_config @@ -377,28 +371,19 @@ def match( def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[MLXGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - tools = generate_config.pop("tools", []) if generate_config else None # type: ignore - full_prompt = self.get_full_prompt( - self.model_family, prompt, system_prompt, chat_history, tools + model_family = self.model_family.model_family or self.model_family.model_name + tools = generate_config.pop("tools", []) if generate_config else None + full_context_kwargs = {} + if tools and model_family in QWEN_TOOL_CALL_FAMILY: + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, **full_context_kwargs ) generate_config = self._sanitize_generate_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. - model_family = self.model_family.model_family or self.model_family.model_name - if tools and model_family in ["qwen-chat", "qwen1.5-chat"]: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = list(stop) + ["Observation:"] - else: - generate_config["stop"] = "Observation:" stream = generate_config.get("stream", False) if stream: @@ -409,7 +394,5 @@ def chat( c = self.generate(full_prompt, generate_config) assert not isinstance(c, Iterator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) diff --git a/xinference/model/llm/mlx/tests/test_mlx.py b/xinference/model/llm/mlx/tests/test_mlx.py index 4fe69fd34f..b1d0682e5b 100644 --- a/xinference/model/llm/mlx/tests/test_mlx.py +++ b/xinference/model/llm/mlx/tests/test_mlx.py @@ -36,6 +36,7 @@ def test_load_mlx(setup): ) assert len(client.list_models()) == 1 model = client.get_model(model_uid) - completion = model.chat("write a poem.") + messages = [{"role": "user", "content": "write a poem."}] + completion = model.chat(messages) assert "content" in completion["choices"][0]["message"] assert len(completion["choices"][0]["message"]["content"]) != 0 diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 3c31b4fe7a..7d2566ee27 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -21,7 +21,6 @@ from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, @@ -29,7 +28,7 @@ ) from .. 
import LLM, LLMFamilyV1, LLMSpecV1 from ..llm_family import CustomLLMFamilyV1 -from ..utils import ChatModelMixin +from ..utils import ChatModelMixin, generate_completion_chunk logger = logging.getLogger(__name__) @@ -346,12 +345,14 @@ async def async_generate( async def stream_results() -> AsyncGenerator[CompletionChunk, None]: prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 + finish_reason = None async for meta_info, out in self._stream_generate( prompt, **sanitized_generate_config ): chunk = self._convert_state_to_completion_chunk( request_id, self.model_uid, output_text=out ) + finish_reason = meta_info["finish_reason"] prompt_tokens = meta_info["prompt_tokens"] completion_tokens = meta_info["completion_tokens"] total_tokens = prompt_tokens + completion_tokens @@ -361,6 +362,28 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: total_tokens=total_tokens, ) yield chunk + + finish_reason = ( + "stop" + if finish_reason is None + or ( + isinstance(finish_reason, str) + and finish_reason.lower() == "none" + ) + else finish_reason + ) + yield generate_completion_chunk( + None, + finish_reason=finish_reason, + chunk_id=request_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + if include_usage: chunk = CompletionChunk( id=request_id, @@ -409,26 +432,17 @@ def _sanitize_chat_config( ) -> Dict: if not generate_config: generate_config = {} - if self.model_family.prompt_style: - if ( - not generate_config.get("stop") - ) and self.model_family.prompt_style.stop: - generate_config["stop"] = self.model_family.prompt_style.stop.copy() + if self.model_family.stop: + if (not generate_config.get("stop")) and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() return generate_config async def async_chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] - full_prompt = self.get_prompt(prompt, chat_history, prompt_style) + full_prompt = self.get_full_context(messages, self.model_family.chat_template) generate_config = self._sanitize_chat_config(generate_config) stream = generate_config.get("stream", None) diff --git a/xinference/model/llm/tests/test_llm_family.py b/xinference/model/llm/tests/test_llm_family.py index 252491282c..146f00dd6f 100644 --- a/xinference/model/llm/tests/test_llm_family.py +++ b/xinference/model/llm/tests/test_llm_family.py @@ -26,7 +26,6 @@ CustomLLMFamilyV1, LlamaCppLLMSpecV1, LLMFamilyV1, - PromptStyleV1, PytorchLLMSpecV1, _generate_meta_file, _get_cache_dir, @@ -70,15 +69,9 @@ def test_deserialize_llm_family_v1(): "model_id":"example/TestModel" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", - "roles": ["user", "assistant"], - "intra_message_sep": "\\n### ", - "inter_message_sep": "\\n### ", - "stop": null, - "stop_token_ids": null - } + "chat_template": "xyz", + "stop_token_ids": [1, 2, 3], + "stop": ["hello", "world"] }""" model_family = LLMFamilyV1.parse_raw(serialized) assert isinstance(model_family, LLMFamilyV1) @@ -108,17 +101,9 @@ def test_deserialize_llm_family_v1(): assert pytorch_spec.model_hub == "huggingface" assert pytorch_spec.model_id == "example/TestModel" - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) - assert prompt_style == model_family.prompt_style + assert model_family.chat_template == "xyz" + assert model_family.stop_token_ids == [1, 2, 3] + assert model_family.stop == ["hello", "world"] def test_serialize_llm_family_v1(): @@ -139,16 +124,6 @@ def test_serialize_llm_family_v1(): model_id="example/TestModel", model_revision="456", ) - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) llm_family = LLMFamilyV1( version=1, model_type="LLM", @@ -156,10 +131,12 @@ def test_serialize_llm_family_v1(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[gguf_spec, pytorch_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) - expected = """{"version": 1, "context_length": 2048, "model_name": "TestModel", "model_lang": ["en"], "model_ability": ["embed", "generate"], "model_description": null, "model_family": null, "model_specs": [{"model_format": "ggufv2", "model_hub": "huggingface", "model_size_in_billions": 2, "quantizations": ["q4_0", "q4_1"], "quantization_parts": {"q4_2": ["a", "b"]}, "model_id": "example/TestModel", "model_revision": "123", "model_file_name_template": "TestModel.{quantization}.bin", "model_file_name_split_template": "TestModel.{quantization}.bin.{part}", "model_uri": null}, {"model_format": "pytorch", "model_hub": "huggingface", "model_size_in_billions": 3, "quantizations": ["int8", "int4", "none"], "model_id": "example/TestModel", "model_revision": "456", "model_uri": null}], "prompt_style": {"style_name": "ADD_COLON_SINGLE", "system_prompt": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", "roles": ["user", "assistant"], "intra_message_sep": "\\n### ", "inter_message_sep": "\\n### ", "stop": null, "stop_token_ids": null}}""" + expected = """{"version": 1, "context_length": 2048, "model_name": "TestModel", "model_lang": ["en"], "model_ability": ["embed", "generate"], "model_description": null, "model_family": null, "model_specs": [{"model_format": "ggufv2", "model_hub": "huggingface", "model_size_in_billions": 2, "quantizations": ["q4_0", "q4_1"], "quantization_parts": {"q4_2": ["a", "b"]}, "model_id": "example/TestModel", "model_revision": "123", "model_file_name_template": "TestModel.{quantization}.bin", "model_file_name_split_template": "TestModel.{quantization}.bin.{part}", "model_uri": null}, {"model_format": "pytorch", "model_hub": "huggingface", "model_size_in_billions": 3, "quantizations": ["int8", "int4", "none"], "model_id": "example/TestModel", "model_revision": "456", "model_uri": null}], "chat_template": "xyz", "stop_token_ids": [1, 2, 3], "stop": ["hello", "world"]}""" assert json.loads(llm_family.json()) == json.loads(expected) llm_family_context_length = LLMFamilyV1( @@ -170,7 +147,9 @@ def test_serialize_llm_family_v1(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[gguf_spec, pytorch_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) assert json.loads(llm_family_context_length.json()) == json.loads(expected) @@ -201,7 +180,9 @@ def test_cache_from_huggingface_pytorch(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = cache_from_huggingface(family, spec, quantization=None) @@ -230,7 +211,9 @@ def test_cache_from_huggingface_gguf(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = _get_cache_dir(family, spec) @@ -266,7 +249,9 @@ def test_cache_from_uri_local(): model_lang=["en"], model_ability=["embed", "chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = cache_from_uri(family, spec) @@ -295,7 +280,9 @@ def test_meta_file(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_dir = cache_from_huggingface(family, spec, quantization=None) @@ -340,7 +327,9 @@ def test_legacy_cache(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_path = get_legacy_cache_path( @@ -378,7 +367,9 @@ def test_custom_llm(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) register_llm(family, False) @@ -408,7 +399,9 @@ def test_persistent_custom_llm(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) register_llm(family, True) @@ -501,16 +494,6 @@ def test_skip_download_pytorch(): model_hub="modelscope", model_revision="456", ) - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. 
The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) llm_family = LLMFamilyV1( version=1, model_type="LLM", @@ -518,7 +501,9 @@ def test_skip_download_pytorch(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[hf_spec, ms_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) cache_dir = _get_cache_dir(llm_family, hf_spec) @@ -594,16 +579,6 @@ def test_skip_download_gguf(): model_revision="123", model_file_name_template="TestModel.{quantization}.bin", ) - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - inter_message_sep="\n### ", - ) llm_family = LLMFamilyV1( version=1, model_type="LLM", @@ -611,7 +586,9 @@ def test_skip_download_gguf(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[hf_spec, ms_spec], - prompt_style=prompt_style, + chat_template="xyz", + stop_token_ids=[1, 2, 3], + stop=["hello", "world"], ) cache_dir = _get_cache_dir(llm_family, hf_spec) @@ -686,7 +663,9 @@ def test_get_cache_status_pytorch(): model_lang=["en"], model_ability=["embed", "generate"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_status = get_cache_status(llm_family=family, llm_spec=spec) @@ -722,7 +701,9 @@ def test_get_cache_status_gguf(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) cache_status = get_cache_status(llm_family=family, llm_spec=spec) @@ -741,13 +722,13 @@ def test_get_cache_status_gguf(): shutil.rmtree(cache_dir) -def test_parse_prompt_style(): +def test_parse_chat_template(): from ..llm_family import BUILTIN_LLM_PROMPT_STYLE assert len(BUILTIN_LLM_PROMPT_STYLE) > 0 # take some examples to assert assert "qwen-chat" in BUILTIN_LLM_PROMPT_STYLE - assert "chatglm3" in BUILTIN_LLM_PROMPT_STYLE + assert "glm4-chat" in BUILTIN_LLM_PROMPT_STYLE assert "baichuan-2-chat" in BUILTIN_LLM_PROMPT_STYLE hf_spec = LlamaCppLLMSpecV1( @@ -776,8 +757,8 @@ def test_parse_prompt_style(): model_lang=["en"], model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], - model_family="chatglm3", - prompt_style="chatglm3", + model_family="glm4-chat", + chat_template="glm4-chat", ) model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) assert model_spec.model_name == llm_family.model_name @@ -791,7 +772,7 @@ def test_parse_prompt_style(): model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], model_family="qwen-vl-chat", - prompt_style="qwen-vl-chat", + chat_template="qwen-vl-chat", ) model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf-8")) assert "vision" in model_spec.model_ability @@ -804,12 +785,12 @@ def test_parse_prompt_style(): model_lang=["en"], model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], - prompt_style="chatglm3", + chat_template="glm4-chat", ) with pytest.raises(ValueError): CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) - # wrong model_family + # successful new model family llm_family = CustomLLMFamilyV1( version=1, model_type="LLM", @@ -818,12 +799,20 
@@ def test_parse_prompt_style(): model_ability=["chat", "generate"], model_family="xyzz", model_specs=[hf_spec, ms_spec], - prompt_style="chatglm3", + chat_template="glm4-chat", ) - with pytest.raises(ValueError): - CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + assert ( + model_spec.chat_template + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["chat_template"] + ) + assert ( + model_spec.stop_token_ids + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop_token_ids"] + ) + assert model_spec.stop == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop"] - # error: wrong prompt style + # when chat_template is None, chat_template = model_family llm_family = CustomLLMFamilyV1( version=1, model_type="LLM", @@ -831,11 +820,19 @@ def test_parse_prompt_style(): model_lang=["en"], model_ability=["chat", "generate"], model_specs=[hf_spec, ms_spec], - model_family="chatglm3", - prompt_style="test_xyz", + model_family="glm4-chat", + chat_template=None, ) - with pytest.raises(ValueError): - CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + model_spec = CustomLLMFamilyV1.parse_raw(bytes(llm_family.json(), "utf8")) + assert ( + model_spec.chat_template + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["chat_template"] + ) + assert ( + model_spec.stop_token_ids + == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop_token_ids"] + ) + assert model_spec.stop == BUILTIN_LLM_PROMPT_STYLE["glm4-chat"]["stop"] def test_match_model_size(): @@ -1073,7 +1070,9 @@ def test_query_engine_general(): model_lang=["en"], model_ability=["chat"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) register_llm(family, False) @@ -1107,15 +1106,9 @@ def test_query_engine_general(): model_lang=["en", "zh"], model_ability=["generate", "chat"], model_specs=[spec], - prompt_style={ - "style_name": "QWEN", - "system_prompt": "You are a helpful assistant.", - "roles": ["user", "assistant"], - "intra_message_sep": "\n", - "inter_message_sep": "", - "stop": ["<|endoftext|>", "<|im_start|>", "<|im_end|>"], - "stop_token_ids": [151643, 151644, 151645], - }, + chat_template="test", + stop=["<|endoftext|>", "<|im_start|>", "<|im_end|>"], + stop_token_ids=[151643, 151644, 151645], ) register_llm(family, False) diff --git a/xinference/model/llm/tests/test_multimodal.py b/xinference/model/llm/tests/test_multimodal.py index 567e0d0355..7bd3e78a15 100644 --- a/xinference/model/llm/tests/test_multimodal.py +++ b/xinference/model/llm/tests/test_multimodal.py @@ -34,16 +34,21 @@ def test_restful_api_for_qwen_vl(setup, model_format, quantization): quantization=quantization, ) model = client.get_model(model_uid) - prompt = [ - {"type": "text", "text": "What’s in this image?"}, + messages = [ { - "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - }, - }, + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } ] - response = model.chat(prompt=prompt) + response = model.chat(messages) assert "grass" in response["choices"][0]["message"]["content"] assert "tree" in response["choices"][0]["message"]["content"] 
assert "sky" in response["choices"][0]["message"]["content"] @@ -141,16 +146,21 @@ def test_restful_api_for_yi_vl(setup, model_format, quantization): quantization=quantization, ) model = client.get_model(model_uid) - prompt = [ - {"type": "text", "text": "What’s in this image?"}, + messages = [ { - "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - }, - }, + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } ] - response = model.chat(prompt=prompt) + response = model.chat(messages) assert "green" in response["choices"][0]["message"]["content"] assert "tree" in response["choices"][0]["message"]["content"] assert "sky" in response["choices"][0]["message"]["content"] @@ -225,16 +235,21 @@ def test_restful_api_for_deepseek_vl(setup, model_format, quantization): temperature=0.0, ) model = client.get_model(model_uid) - prompt = [ - {"type": "text", "text": "What’s in this image?"}, + messages = [ { - "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - }, - }, + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } ] - response = model.chat(prompt=prompt) + response = model.chat(messages) assert any( green in response["choices"][0]["message"]["content"] for green in ["grass", "green"] diff --git a/xinference/model/llm/tests/test_utils.py b/xinference/model/llm/tests/test_utils.py index 42125a0048..9d12d695ca 100644 --- a/xinference/model/llm/tests/test_utils.py +++ b/xinference/model/llm/tests/test_utils.py @@ -12,309 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ....types import ChatCompletionMessage -from ..llm_family import PromptStyleV1 -from ..utils import ChatModelMixin - - -def test_prompt_style_add_colon_single(): - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE", - system_prompt=( - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions." - ), - roles=["user", "assistant"], - intra_message_sep="\n### ", - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "A chat between a curious human and an artificial intelligence assistant. The assistant" - " gives helpful, detailed, and polite answers to the human's questions." - "\n### user: Hi there." - "\n### assistant: Hello, how may I help you?" - "\n### user: Write a poem." 
- "\n### assistant:" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_no_colon_two(): - prompt_style = PromptStyleV1( - style_name="NO_COLON_TWO", - system_prompt="", - roles=[" ", " "], - intra_message_sep="", - inter_message_sep="", - stop_token_ids=[2, 195], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - " Hi there." - " Hello, how may I help you?" - " Write a poem." - " " - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_llama2(): - prompt_style = PromptStyleV1( - style_name="LLAMA2", - system_prompt=( - "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information.\n<>\n\n" - ), - roles=["[INST]", "[/INST]"], - intra_message_sep=" ", - inter_message_sep=" ", - stop_token_ids=[2], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information.\n<>\n\nHi there.[/INST] Hello, how may I help" - " you? [INST] Write a poem. [/INST]" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_llama3(): - prompt_style = PromptStyleV1( - style_name="LLAMA3", - system_prompt=( - "You are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information" - ), - roles=["user", "assistant"], - intra_message_sep="\n\n", - inter_message_sep="<|eot_id|>", - stop_token_ids=[128001, 128009], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" 
- ), - ] - expected = ( - "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" - "You are a helpful, respectful and honest assistant. Always answer" - " as helpfully as possible, while being safe. Your answers should not include any" - " harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please" - " ensure that your responses are socially unbiased and positive in nature.\n\nIf a" - " question does not make any sense, or is not factually coherent, explain why instead" - " of answering something not correct. If you don't know the answer to a question," - " please don't share false information<|eot_id|>" - "<|start_header_id|>user<|end_header_id|>\n\nHi there.<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\nHello, how may I help you?<|eot_id|>" - "<|start_header_id|>user<|end_header_id|>\n\nWrite a poem.<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_chatglm_v3(): - prompt_style = PromptStyleV1( - style_name="CHATGLM3", - system_prompt="", - roles=["user", "assistant"], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|user|>\n Hi there.\n" - "<|assistant|>\n Hello, how may I help you?\n" - "<|user|>\n Write a poem.\n" - "<|assistant|>" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_xverse(): - prompt_style = PromptStyleV1( - style_name="XVERSE", - system_prompt="", - roles=["user", "assistant"], - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|user|> \n Hi there." - "<|assistant|> \n Hello, how may I help you?" - "<|user|> \n Write a poem." - "<|assistant|>" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_qwen(): - prompt_style = PromptStyleV1( - style_name="QWEN", - system_prompt="You are a helpful assistant.", - roles=["user", "assistant"], - intra_message_sep="\n", - ) - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHi there." - "<|im_end|>\n<|im_start|>assistant\nHello, how may I help you?<|im_end|>\n<|im_start|>" - "user\nWrite a poem.<|im_end|>\n<|im_start|>assistant\n" - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_chatml(): - prompt_style = PromptStyleV1( - style_name="CHATML", - system_prompt="You are a wonderful code assistant\n", - roles=["<|user|>", "<|assistant|>"], - intra_message_sep="<|end|>", - ) - - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" 
- ), - ] - - expected = ( - "You are a wonderful code assistant\n" - "<|end|>\n" - "<|user|>\n" - "Hi there.<|end|>\n" - "<|assistant|>\n" - "Hello, how may I help you?<|end|>\n" - "<|user|>\n" - "Write me a HelloWorld Function<|end|>\n" - "<|assistant|>\n" - ) - assert expected == ChatModelMixin.get_prompt( - "Write me a HelloWorld Function", chat_history, prompt_style - ) - - -def test_prompt_style_add_colon_single_cot(): - prompt_style = PromptStyleV1( - style_name="ADD_COLON_SINGLE_COT", - system_prompt=( - "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request." - ), - roles=["Instruction", "Response"], - intra_message_sep="\n\n### ", - ) - - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request." - "\n\n### Instruction: Hi there." - "\n\n### Response: Hello, how may I help you?" - "\n\n### Instruction: Write a poem." - "\n\n### Response: Let's think step by step." - ) - assert expected == ChatModelMixin.get_prompt( - "Write a poem.", chat_history, prompt_style - ) - - -def test_prompt_style_zephyr(): - prompt_style = PromptStyleV1( - style_name="NO_COLON_TWO", - system_prompt=( - "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate.\n" - ), - roles=["<|user|>\n", "<|assistant|>\n"], - intra_message_sep="\n", - inter_message_sep="\n", - stop_token_ids=[2, 195], - stop=[""], - ) - - chat_history = [ - ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."), - ChatCompletionMessage( - role=prompt_style.roles[1], content="Hello, how may I help you?" - ), - ] - expected = ( - "<|system|>\n" - "You are a friendly chatbot who always responds in the style of a pirate.\n" - "<|user|>\n" - "Hi there.\n" - "<|assistant|>\n" - "Hello, how may I help you?\n" - "<|user|>\n" - "Write a poem.\n" - "<|assistant|>\n" - ) - actual = ChatModelMixin.get_prompt("Write a poem.", chat_history, prompt_style) - assert expected == actual - def test_is_valid_model_name(): from ...utils import is_valid_model_name diff --git a/xinference/model/llm/transformers/chatglm.py b/xinference/model/llm/transformers/chatglm.py index 797402b220..723664cbcf 100644 --- a/xinference/model/llm/transformers/chatglm.py +++ b/xinference/model/llm/transformers/chatglm.py @@ -11,45 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
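The hand-rolled PromptStyleV1 formatting exercised by the deleted tests above is superseded by per-family chat templates, which the reworked models render directly from the OpenAI-style message list. A minimal sketch of that flow, assuming a Hugging Face tokenizer that ships a chat template (the model id is only an example of a template-bearing tokenizer):

# Illustrative sketch only: render OpenAI-style messages with a chat template.
from transformers import AutoTokenizer

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a poem."},
]

# glm-4-9b-chat is used here purely as an example; any tokenizer with a template works.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    messages,
    chat_template=tokenizer.chat_template,  # an explicit template string can be passed here too
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)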
-import copy import json -import threading -import time +import typing import uuid +from threading import Thread from typing import Any, Dict, Iterator, List, Optional, Union import torch -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList from ....core.scheduler import InferenceRequest -from ....types import ( - SPECIAL_TOOL_PROMPT, - ChatCompletion, - ChatCompletionChoice, - ChatCompletionChunk, - ChatCompletionMessage, - CompletionChoice, - CompletionChunk, - CompletionUsage, - LoRA, - PytorchGenerateConfig, -) +from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import GLM4_TOOL_CALL_FAMILY +from ..utils import ( + GLM4_TOOL_CALL_FAMILY, + generate_chat_completion, + generate_completion_chunk, +) from .core import PytorchChatModel, PytorchModelConfig -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor - ) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 198] = 5e4 - return scores - - class ChatglmPytorchChatModel(PytorchChatModel): def __init__( self, @@ -107,40 +87,28 @@ def match( if llm_spec.model_format != "pytorch": return False model_family = llm_family.model_family or llm_family.model_name - if "chatglm" not in model_family and "glm4" not in model_family: + if "glm4" not in model_family: return False if "chat" not in llm_family.model_ability: return False return True - def _handle_tools(self, chat_history, generate_config) -> bool: + def _handle_tools(self, messages, generate_config): """Convert openai tools to ChatGLM tools.""" + if self.model_family.model_name not in GLM4_TOOL_CALL_FAMILY: + return None if generate_config is None: - return False + return None tools = generate_config.pop("tools", None) if tools is None: - return False - # Convert a iterable to a list + return None + # Convert an iterable to a list tools = list(tools) tool_choice = generate_config.pop("tool_choice", "none") - if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY: - chat_history[:] = self._process_messages( - chat_history, tools=tools, tool_choice=tool_choice - ) - return True - else: - chatglm_tools = [] - for elem in tools: - if elem.get("type") != "function" or "function" not in elem: - raise ValueError("ChatGLM tools only support function type.") - chatglm_tools.append(elem["function"]) - tool_prompt_message = { - "role": "system", - "content": f"Answer the following questions as best as you can. 
You have access to the following tools:", - "tools": chatglm_tools, - } - chat_history.insert(0, tool_prompt_message) - return True + messages[:] = self._process_messages( + messages, tools=tools, tool_choice=tool_choice + ) + return tools @staticmethod def _process_messages(messages, tools=None, tool_choice="none"): @@ -230,12 +198,70 @@ def _filter_tools(_tool_choice, _tools): return processed_messages @staticmethod - def _process_response(output, history, tools, end=False): + @typing.no_type_check + def _process_response_non_streaming( + output: str, tools: Union[Dict, List[Dict]] = None, use_tool: bool = False + ) -> Union[str, dict]: + """ + Copied from https://github.com/THUDM/GLM-4/blob/main/basic_demo/openai_api_server.py#L150 + """ + import re + + lines = output.strip().split("\n") + arguments_json = None + special_tools = ["cogview", "simple_browser"] + tools = {tool["function"]["name"] for tool in tools} if tools else {} + + # 这是一个简单的工具比较函数,不能保证拦截所有非工具输出的结果,比如参数未对齐等特殊情况。 + ##TODO 如果你希望做更多判断,可以在这里进行逻辑完善。 + + if len(lines) >= 2 and lines[1].startswith("{"): + function_name = lines[0].strip() + arguments = "\n".join(lines[1:]).strip() + if function_name in tools or function_name in special_tools: + try: + arguments_json = json.loads(arguments) + is_tool_call = True + except json.JSONDecodeError: + is_tool_call = function_name in special_tools + + if is_tool_call and use_tool: + content = { + "name": function_name, + "arguments": json.dumps( + arguments_json + if isinstance(arguments_json, dict) + else arguments, + ensure_ascii=False, + ), + } + if function_name == "simple_browser": + search_pattern = re.compile( + r'search\("(.+?)"\s*,\s*recency_days\s*=\s*(\d+)\)' + ) + match = search_pattern.match(arguments) + if match: + content["arguments"] = json.dumps( + { + "query": match.group(1), + "recency_days": int(match.group(2)), + }, + ensure_ascii=False, + ) + elif function_name == "cogview": + content["arguments"] = json.dumps( + {"prompt": arguments}, ensure_ascii=False + ) + + return content + return output.strip() + + @staticmethod + def _process_response_streaming(output, tools, end=False): # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py content = "" - history = copy.deepcopy(history) if not tools and end: - return None, None + return None for response in output.split("<|assistant|>"): if "\n" in response: metadata, content = response.split("\n", maxsplit=1) @@ -244,204 +270,53 @@ def _process_response(output, history, tools, end=False): if not metadata.strip(): if tools and any(t.startswith(response) for t in tools) and not end: # Waiting for tool call complete. 
- return None, None + return None content = content.strip() - history.append( - {"role": "assistant", "metadata": metadata, "content": content} - ) content = content.replace("[[训练时间]]", "2023年") else: if tools and metadata in tools and not end: - return None, None - history.append( - {"role": "assistant", "metadata": metadata, "content": content} - ) + return None metadata = metadata.strip() if tools and metadata in tools and end: try: parameters = json.loads(content) - content = {"name": metadata.strip(), "parameters": parameters} + content = {"name": metadata.strip(), "arguments": parameters} except json.JSONDecodeError: content = {"name": metadata.strip(), "content": content} else: content = {"name": metadata.strip(), "content": content} - return content, history - - def _get_generate_args( - self, - tokenizer, - query: str, - history: Optional[List[Dict]] = None, - role: str = "user", - past_key_values=None, - max_length: int = 8192, - do_sample=True, - top_p=0.8, - temperature=0.8, - logits_processor=None, - **kwargs, - ): - # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - eos_token_id = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids("<|user|>"), - tokenizer.convert_tokens_to_ids("<|observation|>"), - ] - gen_kwargs = { - "max_length": max_length, - "do_sample": do_sample, - "top_p": top_p, - "temperature": temperature, - "logits_processor": logits_processor, - **kwargs, - } - if past_key_values is None: - inputs = tokenizer.apply_chat_template( - history + [{"role": role, "content": query}], - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True, - ) - else: - inputs = tokenizer.apply_chat_template( - [{"role": role, "content": query}], - add_special_tokens=False, - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True, - ) - inputs = inputs.to(self._model.device) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - attention_mask = torch.cat( - (attention_mask.new_ones(1, past_length), attention_mask), dim=1 - ) - inputs["attention_mask"] = attention_mask - history.append({"role": role, "content": query}) - tools = history[0]["role"] == "system" and history[0].get("tools") - tools = ( - [ - t.get("function", {}).get("name", "") - for t in tools - if isinstance(t, dict) - ] - if tools - else [] - ) - kwargs = dict(inputs) - kwargs["past_key_values"] = past_key_values - kwargs["eos_token_id"] = eos_token_id - kwargs.update(gen_kwargs) - return kwargs, tools + return content @torch.inference_mode() - def _stream_chat( - self, - tokenizer, - query: str, - history: Optional[List[Dict]] = None, - role: str = "user", - past_key_values=None, - max_length: int = 8192, - do_sample=True, - top_p=0.8, - temperature=0.8, - logits_processor=None, - **kwargs, - ): + def _stream_chat(self, inputs, tools, **kwargs): from transformers import TextIteratorStreamer - kwargs, tools = self._get_generate_args( - tokenizer=tokenizer, - query=query, - history=history, - role=role, - past_key_values=past_key_values, - max_length=max_length, - do_sample=do_sample, - top_p=top_p, - temperature=temperature, - logits_processor=logits_processor, - **kwargs, - ) - streamer = TextIteratorStreamer( - tokenizer, skip_prompt=True, 
skip_special_tokens=True + self._tokenizer, skip_prompt=True, skip_special_tokens=True ) - kwargs["streamer"] = streamer - thread = threading.Thread(target=self._model.generate, kwargs=kwargs) + tools = {tool["function"]["name"] for tool in tools} if tools else {} + generation_kwargs = dict(inputs, streamer=streamer) + generation_kwargs.update(kwargs) + thread = Thread(target=self._model.generate, kwargs=generation_kwargs) thread.start() response = "" for token in streamer: response += token if response and response[-1] != "�": - new_response, new_history = self._process_response( - response, history, tools, end=False + new_response = self._process_response_streaming( + response, tools, end=False ) if new_response is None: continue - yield new_response, new_history + yield new_response if tools: - new_response, new_history = self._process_response( - response, history, tools, end=True - ) + new_response = self._process_response_streaming(response, tools, end=True) if new_response: - yield new_response, new_history - - @torch.inference_mode() - def _non_stream_chat( - self, - tokenizer, - query: str, - history: Optional[List[Dict]] = None, - role: str = "user", - past_key_values=None, - max_length: int = 8192, - do_sample=True, - top_p=0.8, - temperature=0.8, - logits_processor=None, - **kwargs, - ): - kwargs, tools = self._get_generate_args( - tokenizer=tokenizer, - query=query, - history=history, - role=role, - past_key_values=past_key_values, - max_length=max_length, - do_sample=do_sample, - top_p=top_p, - temperature=temperature, - logits_processor=logits_processor, - **kwargs, - ) - - outputs = self._model.generate(**kwargs) - outputs = outputs[:, kwargs["input_ids"].shape[1] :] - response = tokenizer.decode(outputs[0], skip_special_tokens=True) - if tools: - return self._process_response(response, history, tools, end=True) - else: - return self._process_response(response, history, tools) + yield new_response - def chat( - self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, - generate_config: Optional[PytorchGenerateConfig] = None, - ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + @staticmethod + def _get_generate_kwargs(generate_config): kwargs: Dict[str, Any] = {} generate_config = generate_config or {} temperature = generate_config.get("temperature") @@ -453,18 +328,26 @@ def chat( max_new_tokens = generate_config.get("max_tokens") if max_new_tokens is not None: kwargs["max_new_tokens"] = int(max_new_tokens) - chat_history = chat_history or [] - tools = self._handle_tools(chat_history, generate_config) - # Tool calls only works for non stream, so we call chat directly. 
- if prompt == SPECIAL_TOOL_PROMPT and chat_history: - tool_message = chat_history.pop() - content = tool_message.get("content") - assert content is not None - prompt = content - kwargs["role"] = "observation" - chat_history = [h for h in chat_history if not h.get("tool_calls")] - if system_prompt: - chat_history.append({"role": "system", "content": system_prompt}) + do_sample = generate_config.get("do_sample") + if do_sample is not None: + kwargs["do_sample"] = bool(do_sample) + top_k = generate_config.get("top_k") + if top_k is not None: + kwargs["top_k"] = top_k + repetition_penalty = generate_config.get("repetition_penalty") + if repetition_penalty is not None: + kwargs["repetition_penalty"] = repetition_penalty + return kwargs + + def chat( + self, + messages: List[Dict], + generate_config: Optional[PytorchGenerateConfig] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + generate_config = generate_config or {} + kwargs: Dict[str, Any] = self._get_generate_kwargs(generate_config) + tools = self._handle_tools(messages, generate_config) + has_tools = tools is not None stream = generate_config.get("stream", False) stream_options = generate_config.pop("stream_options", None) include_usage = ( @@ -472,103 +355,82 @@ def chat( if isinstance(stream_options, dict) else False ) - if stream and ( - not tools or self.model_family.model_name in GLM4_TOOL_CALL_FAMILY - ): + inputs = self._tokenizer.apply_chat_template( + messages, + return_tensors="pt", + chat_template=self.model_family.chat_template, + add_generation_prompt=True, + return_dict=True, + ) + inputs = inputs.to(self._model.device) + + if not stream: + with torch.no_grad(): + outputs = self._model.generate(**inputs, **kwargs) + outputs = outputs[:, inputs["input_ids"].shape[1] :] + response = self._tokenizer.decode(outputs[0], skip_special_tokens=True) + # In some cases, the response starts with `\n` + if response.startswith("\n"): + response = response[1:] + if has_tools: + function_call = self._process_response_non_streaming( + response, tools, use_tool=True + ) + return self._tool_calls_completion( + self.model_family, self.model_uid, function_call + ) + else: + return generate_chat_completion(self.model_uid, response) + else: def _stream_generator(): last_chunk_text_length = 0 chunk_id = "chat-" + str(uuid.uuid1()) prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 - inputs = self._tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(self._model.device) prompt_tokens = len(inputs["input_ids"][0]) - for chunk_text, _ in self._stream_chat( - self._tokenizer, prompt, chat_history, **kwargs - ): + for chunk_text in self._stream_chat(inputs, tools, **kwargs): if tools and isinstance(chunk_text, dict): yield self._tool_calls_completion_chunk( - self.model_family, self.model_uid, [chunk_text, _], tools + self.model_family, self.model_uid, chunk_text ) return completion_tokens = completion_tokens + 1 total_tokens = prompt_tokens + completion_tokens chunk_text = chunk_text[last_chunk_text_length:] last_chunk_text_length += len(chunk_text) - completion_choice = CompletionChoice( - text=chunk_text, index=0, logprobs=None, finish_reason=None - ) - yield CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=total_tokens, - ), + yield generate_completion_chunk( + chunk_text, + finish_reason=None, + 
chunk_id=chunk_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, ) - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + None, + finish_reason="stop", + chunk_id=chunk_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk if include_usage: - chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + None, + finish_reason=None, + chunk_id=chunk_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, ) - yield chunk return self._to_chat_completion_chunks(_stream_generator()) - else: - response = self._non_stream_chat( - self._tokenizer, prompt, chat_history, **kwargs - ) - if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, response, tools - ) - else: - content, _ = response - return ChatCompletion( - id="chat" + str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - ChatCompletionChoice( - index=0, - message={"role": "assistant", "content": content}, - finish_reason="stop", - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) def prepare_sanitize_generate_config(self, req: InferenceRequest): """ diff --git a/xinference/model/llm/transformers/cogvlm2.py b/xinference/model/llm/transformers/cogvlm2.py index 79b15be69c..f3c27454d9 100644 --- a/xinference/model/llm/transformers/cogvlm2.py +++ b/xinference/model/llm/transformers/cogvlm2.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
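With the prompt / system_prompt / chat_history triple gone, callers hand the whole OpenAI-style message list to chat(), and GLM4 tool definitions are taken from the request instead of being spliced into a synthetic system message. A hedged usage sketch; the weather tool and the already-obtained model handle are assumptions made only for illustration:

# Illustrative sketch: messages-based chat with an OpenAI-style tool definition.
# `model` is assumed to be a chat model handle obtained elsewhere via Client.get_model(...).
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the weather like in Beijing today?"},
]
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool used only for this example
            "description": "Query the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]
completion = model.chat(messages, tools=tools, generate_config={"stream": False})
message = completion["choices"][0]["message"]
# When the model decides to call the tool, the assistant message carries
# `tool_calls` with the function name and JSON-encoded arguments.
print(message.get("tool_calls") or message.get("content"))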
import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -21,17 +20,14 @@ from ....core.scheduler import InferenceRequest from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig from .utils import get_max_src_len @@ -139,9 +135,7 @@ def _message_content_to_cogvlm2(self, content): ) return content, None - def _history_content_to_cogvlm2( - self, system_prompt: str, chat_history: List[ChatCompletionMessage] - ): + def _history_content_to_cogvlm2(self, system_prompt: str, chat_history: List[Dict]): query = system_prompt history: List[Tuple] = [] pixel_values = None @@ -163,7 +157,7 @@ def get_query_and_history( self, prompt: Union[str, List[Dict]], system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + chat_history: Optional[List[Dict]] = None, ): content, image = self._message_content_to_cogvlm2(prompt) @@ -184,12 +178,12 @@ def get_query_and_history( def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - system_prompt = system_prompt if system_prompt else "" + system_prompt = "" + if messages[0]["role"] == "system": + system_prompt = messages[0]["content"] stream = generate_config.get("stream", False) if generate_config else False sanitized_config = { @@ -199,6 +193,7 @@ def chat( else 512, } + prompt, _, chat_history = parse_messages(messages) query, image, history = self.get_query_and_history( prompt, system_prompt=system_prompt, chat_history=chat_history ) @@ -236,21 +231,7 @@ def chat( response = self._tokenizer.decode(outputs[0]) response = response.split("<|end_of_text|>")[0] - chunk = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(chunk) + return generate_chat_completion(self.model_uid, response) def _streaming_chat_response( self, inputs: Dict, config: Dict @@ -277,36 +258,26 @@ def _streaming_chat_response( completion_id = str(uuid.uuid1()) for new_text in streamer: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=new_text, finish_reason=None, logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, ) - yield chunk - - completion_choice = CompletionChoice( - 
text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=False, ) - yield chunk @staticmethod def build_position_ids(x, attention_mask=None): @@ -341,7 +312,9 @@ def build_position_ids(x, attention_mask=None): def get_dtype(self): return self._torch_type - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): + def _get_full_prompt(self, messages: List[Dict], tools): + prompt, system_prompt, chat_history = parse_messages(messages) + system_prompt = system_prompt or "" query, image, history = self.get_query_and_history( prompt, system_prompt=system_prompt, chat_history=chat_history ) diff --git a/xinference/model/llm/transformers/cogvlm2_video.py b/xinference/model/llm/transformers/cogvlm2_video.py index 24f31e0b5c..f39119f7aa 100644 --- a/xinference/model/llm/transformers/cogvlm2_video.py +++ b/xinference/model/llm/transformers/cogvlm2_video.py @@ -12,28 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Tuple, Union import torch -from ....core.scheduler import InferenceRequest from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig -from .utils import get_max_src_len logger = logging.getLogger(__name__) @@ -170,9 +164,7 @@ def _message_content_to_cogvlm2(self, content): return text, images, video return content, [], None - def _history_content_to_cogvlm2( - self, system_prompt: str, chat_history: List[ChatCompletionMessage] - ): + def _history_content_to_cogvlm2(self, system_prompt: str, chat_history: List[Dict]): query = system_prompt history: List[Tuple] = [] pixel_values = None @@ -202,7 +194,7 @@ def get_query_and_history( self, prompt: Union[str, List[Dict]], system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + chat_history: Optional[List[Dict]] = None, ): content, image, video = self._message_content_to_cogvlm2(prompt) @@ -237,12 +229,12 @@ def get_query_and_history( def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - system_prompt = system_prompt if system_prompt else "" + system_prompt = "" + if messages[0]["role"] == "system": + system_prompt = messages[0]["content"] stream = generate_config.get("stream", False) if generate_config else 
False sanitized_config = { @@ -252,6 +244,7 @@ def chat( else 512, } + prompt, _, chat_history = parse_messages(messages) query, image, video, history = self.get_query_and_history( prompt, system_prompt=system_prompt, chat_history=chat_history ) @@ -292,21 +285,7 @@ def chat( response = self._tokenizer.decode(outputs[0]) response = response.split("<|end_of_text|>")[0] - chunk = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(chunk) + return generate_chat_completion(self.model_uid, response) def _streaming_chat_response( self, inputs: Dict, config: Dict @@ -333,192 +312,23 @@ def _streaming_chat_response( completion_id = str(uuid.uuid1()) for new_text in streamer: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=new_text, finish_reason=None, logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, ) - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - yield chunk - - @staticmethod - def build_position_ids(x, attention_mask=None): - """ - Copied from https://huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B-int4/blob/main/modeling_cogvlm.py - """ - # Fix: 参考官方开源代码 - if attention_mask is not None: - tmp = x.clone() - tmp[~(attention_mask.bool())] = -1 - else: - tmp = x.clone() - # image boi eoi token as LANGUAGE_TOKEN_TYPE - is_boi_eoi = torch.zeros_like(x, dtype=torch.bool) - is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & ( - tmp[:, :-1] == LANGUAGE_TOKEN_TYPE - ) - is_boi_eoi[:, 0] |= tmp[:, 0] == VISION_TOKEN_TYPE - is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & ( - tmp[:, 1:] == LANGUAGE_TOKEN_TYPE + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=False, ) - is_boi_eoi[:, -1] |= tmp[:, -1] == VISION_TOKEN_TYPE - tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE - # final position ids - y = torch.zeros_like(x, dtype=torch.long) - y[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | ( - (tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE) - ) - y = y.cumsum(dim=-1) - return y - - def get_dtype(self): - return self._torch_type - - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - query, image, video, history = self.get_query_and_history( - prompt, system_prompt=system_prompt, chat_history=chat_history - ) - - if video: - image = [video] - - input_by_model: dict = self._model.build_conversation_input_ids( # type: ignore - self._tokenizer, - query=query, - history=history, - images=image, - 
template_version="chat", - ) - return { - "input_ids": input_by_model["input_ids"], # seq_len - "token_type_ids": input_by_model["token_type_ids"], # seq_len - "attention_mask": input_by_model["attention_mask"], # seq_len - "images": input_by_model["images"], - } - - def prepare_sanitize_generate_config(self, req: InferenceRequest): - """ - See https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B/blob/main/generation_config.json - """ - raw_config = req.inference_kwargs.get("raw_params", {}) - temperature = raw_config.get("temperature", None) - if temperature is None: - raw_config["temperature"] = 0.6 - top_p = raw_config.get("top_p", None) - if top_p is None: - raw_config["top_p"] = 0.9 - return raw_config - - def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]): - context_len = self.get_context_len() - assert isinstance(prompts[0], dict) - images = [] - max_length = float("-inf") - for i, feature in enumerate(prompts): - req = req_list[i] - if "images" in feature: - images.append(feature.pop("images", None)) - max_src_len = get_max_src_len(context_len, req) - input_ids = feature["input_ids"][-max_src_len:] - req.prompt_tokens = input_ids.tolist() - feature["input_ids"] = input_ids - feature["token_type_ids"] = feature["token_type_ids"][-max_src_len:] - feature["attention_mask"] = feature["attention_mask"][-max_src_len:] - req.extra_kwargs["attention_mask_seq_len"] = feature[ - "attention_mask" - ].shape[0] - max_length = max(len(input_ids), max_length) - - def pad_to_max_length_internal(feature, max_len, idx): - padding_length = max_len - len(feature["input_ids"]) - req_list[idx].padding_len = padding_length - feature["input_ids"] = torch.cat( - [torch.full((padding_length,), 0), feature["input_ids"]] - ) - feature["token_type_ids"] = torch.cat( - [ - torch.zeros(padding_length, dtype=torch.long), - feature["token_type_ids"], - ] - ) - feature["attention_mask"] = torch.cat( - [ - torch.zeros(padding_length, dtype=torch.long), - feature["attention_mask"], - ] - ) - return feature - - features = [ - pad_to_max_length_internal(feature, max_length, i) - for i, feature in enumerate(prompts) - ] - batch = { - key: torch.stack([feature[key] for feature in features]) - for key in features[0].keys() - } - - position_ids = self.build_position_ids(batch["token_type_ids"]) - batch["position_ids"] = position_ids - - for i in range(len(prompts)): - req = req_list[i] - req.extra_kwargs["max_position_id"] = position_ids[i : i + 1, -1].item() - - if images: - batch["images"] = images - - batch = recur_move_to( - batch, self._device, lambda x: isinstance(x, torch.Tensor) - ) - dtype = self.get_dtype() - if dtype: - batch = recur_move_to( - batch, - dtype, - lambda x: isinstance(x, torch.Tensor) and torch.is_floating_point(x), - ) - return batch - - def build_decode_token_type_ids( - self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] - ): - token_type_ids = torch.full( - (batch_size, 1), fill_value=1, dtype=torch.long, device=self._device - ) - return token_type_ids - - def build_decode_position_ids( - self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] - ): - tmp = [] - for r in reqs: - r.extra_kwargs["max_position_id"] += 1 - tmp.append(r.extra_kwargs["max_position_id"]) - position_ids = torch.as_tensor( - tmp, device=self._device, dtype=torch.long - ).unsqueeze(1) - return position_ids diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index b02f88e947..fd7d75b22e 100644 --- 
a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -16,7 +16,7 @@ import logging import os from functools import lru_cache -from typing import Iterable, Iterator, List, Optional, Tuple, Union +from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union import torch @@ -29,7 +29,6 @@ from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, @@ -52,8 +51,6 @@ "chatglm3-128k", "glm4-chat", "glm4-chat-1m", - "llama-2", - "llama-2-chat", "internlm2-chat", "internlm2.5-chat", "qwen-vl-chat", @@ -615,12 +612,13 @@ def prepare_batch_inference(self, req_list: List[InferenceRequest]): r.error_msg = str(e) def get_builtin_stop_token_ids(self) -> Tuple: - return ( - tuple(self.model_family.prompt_style.stop_token_ids) - if self.model_family.prompt_style - and self.model_family.prompt_style.stop_token_ids - else tuple() - ) + from ..utils import get_stop_token_ids_from_config_file + + stop_token_ids = get_stop_token_ids_from_config_file(self.model_path) + if stop_token_ids is not None: + return tuple(stop_token_ids) + else: + return tuple(self.model_family.stop_token_ids) def handle_batch_inference_results(self, req_list: List[InferenceRequest]): for req in req_list: @@ -693,20 +691,13 @@ def _sanitize_generate_config( generate_config: Optional[PytorchGenerateConfig], ) -> PytorchGenerateConfig: generate_config = super()._sanitize_generate_config(generate_config) - if ( - (not generate_config.get("stop")) - and self.model_family.prompt_style - and self.model_family.prompt_style.stop - ): - generate_config["stop"] = self.model_family.prompt_style.stop.copy() + if (not generate_config.get("stop")) and self.model_family.stop is not None: + generate_config["stop"] = self.model_family.stop.copy() if ( generate_config.get("stop_token_ids", None) is None - and self.model_family.prompt_style - and self.model_family.prompt_style.stop_token_ids + and self.model_family.stop_token_ids is not None ): - generate_config[ - "stop_token_ids" - ] = self.model_family.prompt_style.stop_token_ids.copy() + generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy() return generate_config @@ -725,26 +716,22 @@ def match( def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: tools = generate_config.pop("tools", []) if generate_config else None - full_prompt = self._get_full_prompt(prompt, system_prompt, chat_history, tools) - - generate_config = self._sanitize_generate_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. 
model_family = self.model_family.model_family or self.model_family.model_name + full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = list(stop) + ["Observation:"] - else: - generate_config["stop"] = "Observation:" + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, + self.model_family.chat_template, + tokenizer=self._tokenizer, + **full_context_kwargs, + ) + + generate_config = self._sanitize_generate_config(generate_config) stream = generate_config.get("stream", False) if stream: @@ -755,22 +742,15 @@ def chat( c = self.generate(full_prompt, generate_config) assert not isinstance(c, Iterator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) def load(self): super().load() - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] - full_prompt = ChatModelMixin.get_prompt( - prompt, chat_history, prompt_style, tools=tools + def _get_full_prompt(self, messages: List[Dict], tools): + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, tokenizer=self._tokenizer ) return full_prompt @@ -779,9 +759,7 @@ def prepare_batch_inference(self, req_list: List[InferenceRequest]): for r in req_list: try: if not r.stopped and r.is_prefill: - r.full_prompt = self._get_full_prompt( - r.prompt, r.system_prompt, r.chat_history, None - ) + r.full_prompt = self._get_full_prompt(r.prompt, None) except Exception as e: logger.exception(f"prepare inference error with {e}") r.stopped = True diff --git a/xinference/model/llm/transformers/deepseek_vl.py b/xinference/model/llm/transformers/deepseek_vl.py index d24158f5d4..cfec06b7d8 100644 --- a/xinference/model/llm/transformers/deepseek_vl.py +++ b/xinference/model/llm/transformers/deepseek_vl.py @@ -15,7 +15,6 @@ import logging import os.path import tempfile -import time import uuid from concurrent.futures import ThreadPoolExecutor from io import BytesIO @@ -25,16 +24,9 @@ import torch from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -147,9 +139,7 @@ def _fill_placeholder(_url, _index): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: if not generate_config: @@ -162,44 +152,40 @@ def chat( if isinstance(stream_options, dict) else False ) - prompt, images = self._message_content_to_deepseek(prompt) - 
prompt_messages: List[Dict[str, Any]] = [ - { - "role": "User", - "content": prompt, - }, - {"role": "Assistant", "content": ""}, - ] - if images: - prompt_messages[0]["images"] = images - - # Convert openai history to qwen vl history - deepseek_history = [] - for h in chat_history or []: - role = h["role"] + + prompt = "" + deepseek_messages = [] + for i, message in enumerate(messages): + role = message["role"] + content = message["content"] if role == "user": - content, images = self._message_content_to_deepseek(h["content"]) - msg: Dict[str, Any] = { - "role": "User", - "content": content, - } - if images: - msg["images"] = images - deepseek_history.append(msg) + if isinstance(content, str): + deepseek_messages.append({"role": "User", "content": content}) + else: + content, images = self._message_content_to_deepseek(content) + msg: Dict[str, Any] = { + "role": "User", + "content": content, + } + if images: + msg["images"] = images + deepseek_messages.append(msg) + if i == len(messages) - 1: + prompt = content elif role == "assistant": - deepseek_history.append({"role": "Assistant", "content": h["content"]}) + deepseek_messages.append({"role": "Assistant", "content": content}) else: - logger.error("Unexpected msg in chat history: %s", h) - - deepseek_history.extend(prompt_messages) + logger.error( + f"Unexpected message in messages: role: {role}, message: {message}" + ) from ....thirdparty.deepseek_vl.serve.inference import generate from ....thirdparty.deepseek_vl.utils.io import load_pil_images # load images and prepare for inputs - pil_images = load_pil_images(deepseek_history) + pil_images = load_pil_images(deepseek_messages) prepare_inputs = self._vl_chat_processor( - conversations=deepseek_history, images=pil_images, force_batchify=True + conversations=deepseek_messages, images=pil_images, force_batchify=True ).to(self._model.device, self._model.dtype) temperature = generate_config.get("temperature", 0.2) @@ -226,31 +212,16 @@ def chat( it = self._generate_stream(streamer, stop_str, include_usage, prompt) return self._to_chat_completion_chunks(it) else: - c = self._generate(streamer, stop_str) - return self._to_chat_completion(c) + return self._generate(streamer, stop_str) - def _generate(self, streamer, stop_str) -> Completion: + def _generate(self, streamer, stop_str) -> ChatCompletion: generated_text = "" for new_text in streamer: if new_text.endswith(stop_str): new_text = new_text[: -len(stop_str)] generated_text += new_text - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=generated_text, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return c + return generate_chat_completion(self.model_uid, generated_text) def _generate_stream( self, streamer, stop_str, include_usage, prompt @@ -262,54 +233,40 @@ def _generate_stream( for i, new_text in enumerate(streamer): if new_text.endswith(stop_str): new_text = new_text[: -len(stop_str)] - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = i total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + 
finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=True, ) - chunk["usage"] = completion_usage - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk + if include_usage: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk diff --git a/xinference/model/llm/transformers/glm4v.py b/xinference/model/llm/transformers/glm4v.py index 4df4f9cd4d..c16a167688 100644 --- a/xinference/model/llm/transformers/glm4v.py +++ b/xinference/model/llm/transformers/glm4v.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import typing import uuid from concurrent.futures import ThreadPoolExecutor @@ -22,18 +21,10 @@ import torch from ....core.scheduler import InferenceRequest -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk from .core import PytorchChatModel, PytorchGenerateConfig from .utils import get_max_src_len @@ -102,66 +93,45 @@ def load(self): self._tokenizer = tokenizer self._save_tensorizer() - def _message_content_to_chat(self, content): - if not isinstance(content, str): - texts = [] - image_urls = [] - for c in content: - c_type = c.get("type") - if c_type == "text": - texts.append(c["text"]) - elif c_type == "image_url": - image_urls.append(c["image_url"]["url"]) - image_futures = [] - with ThreadPoolExecutor() as executor: - for image_url in image_urls: - fut = executor.submit(_decode_image, image_url) - image_futures.append(fut) - images = [fut.result() for fut in image_futures] - text = " ".join(texts) - if len(images) == 0: - return text, [] - elif len(images) == 1: - return text, images + @staticmethod + def _get_processed_msgs(messages: List[Dict]) -> List[Dict]: + res = [] + for message in messages: + role = message["role"] + content = message["content"] + if isinstance(content, str): + res.append({"role": role, "content": content}) else: - raise RuntimeError("Only one image per message is supported") - return content, [] - - def _get_chat_msgs( - self, - prompt: Union[str, List[Dict]], 
- chat_history: Optional[List[ChatCompletionMessage]] = None, - ): - content, images_chat = self._message_content_to_chat(prompt) - - msgs = [] - query_to_response: List[Dict] = [] - images_history = [] - for h in chat_history or []: - role = h["role"] - content_h, images_tmp = self._message_content_to_chat(h["content"]) - if images_tmp: - images_history = images_tmp - if len(query_to_response) == 0 and role == "user": - query_to_response.append({"role": "user", "content": content_h}) - if len(query_to_response) == 1 and role == "assistant": - query_to_response.append({"role": "assistant", "content": content_h}) - if len(query_to_response) == 2: - msgs.extend(query_to_response) - query_to_response = [] - image = None - if len(images_chat) > 0: - image = images_chat[0] - elif len(images_history) > 0: - image = images_history[0] - msgs.append({"role": "user", "content": content, "image": image}) - return msgs + texts = [] + image_urls = [] + for c in content: + c_type = c.get("type") + if c_type == "text": + texts.append(c["text"]) + else: + assert ( + c_type == "image_url" + ), "Please follow the image input of the OpenAI API." + image_urls.append(c["image_url"]["url"]) + if len(image_urls) > 1: + raise RuntimeError("Only one image per message is supported") + image_futures = [] + with ThreadPoolExecutor() as executor: + for image_url in image_urls: + fut = executor.submit(_decode_image, image_url) + image_futures.append(fut) + images = [fut.result() for fut in image_futures] + assert len(images) <= 1 + text = " ".join(texts) + if images: + res.append({"role": role, "content": text, "image": images[0]}) + else: + res.append({"role": role, "content": text}) + return res def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: from transformers import TextIteratorStreamer @@ -170,7 +140,7 @@ def chat( generate_config = {} stream = generate_config.get("stream", False) - msgs = self._get_chat_msgs(prompt, chat_history) + msgs = self._get_processed_msgs(messages) inputs = self._tokenizer.apply_chat_template( msgs, @@ -213,64 +183,38 @@ def chat( response = self._tokenizer.decode(outputs[0]) if response.endswith(stop_str): response = response[: -len(stop_str)] - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(c) + return generate_chat_completion(self.model_uid, response) def chat_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]: completion_id = str(uuid.uuid1()) for new_text in streamer: if not new_text.endswith(stop_str): - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=True, ) - 
chunk["usage"] = completion_usage - yield chunk - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - msgs = self._get_chat_msgs(prompt, chat_history) + def _get_full_prompt(self, messages, tools): + msgs = self._get_processed_msgs(messages) inputs = self._tokenizer.apply_chat_template( msgs, add_generation_prompt=True, diff --git a/xinference/model/llm/transformers/intern_vl.py b/xinference/model/llm/transformers/intern_vl.py index 02632e2af8..242d4d27ac 100644 --- a/xinference/model/llm/transformers/intern_vl.py +++ b/xinference/model/llm/transformers/intern_vl.py @@ -12,24 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Union import torch -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -78,7 +74,7 @@ def _message_content_to_intern(content, image_cnt): def _get_prompt_and_chat_history( prompt: Union[str, List[Dict]], - chat_history: Optional[List[ChatCompletionMessage]] = None, + chat_history: Optional[List[Dict]] = None, ): # Convert openai history to intern vl history images = [] @@ -332,9 +328,7 @@ def load(self, **kwargs): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: from ....thirdparty.internvl.conversation import get_conv_template @@ -366,6 +360,7 @@ def chat( else False ) + prompt, _, chat_history = parse_messages(messages) content, history, images, videos = _get_prompt_and_chat_history( prompt, chat_history ) @@ -434,10 +429,9 @@ def chat( chunk = self._generate_stream(generate_kwargs, input_ids, include_usage) return self._to_chat_completion_chunks(chunk) else: - chunk = self._generate(generate_kwargs, input_ids, template) - return self._to_chat_completion(chunk) + return self._generate(generate_kwargs, input_ids, template) - def _generate(self, generate_kwargs, input_ids, template): + def _generate(self, generate_kwargs, input_ids, template) -> ChatCompletion: prompt_tokens = len(input_ids[0]) generation_output = self._model.generate(**generate_kwargs) completion_tokens = len(generation_output[0]) @@ -445,23 +439,13 @@ def _generate(self, generate_kwargs, input_ids, template): 
generation_output, skip_special_tokens=True )[0] response = response.split(template.sep)[0].strip() - chunk = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ), + return generate_chat_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, ) - return chunk def _generate_stream(self, generate_kwargs, input_ids, include_usage): from threading import Thread @@ -483,58 +467,43 @@ def _generate_stream(self, generate_kwargs, input_ids, include_usage): completion_id = str(uuid.uuid1()) prompt_tokens = len(input_ids[0]) - completion_tokens = 0 + total_tokens, completion_tokens = 0, 0 # Loop through the streamer to get the new text as it is generated for i, new_text in enumerate(streamer): if new_text == self._model.conv_template.sep: break - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = max(completion_tokens, len(streamer.token_cache)) total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - chunk["usage"] = completion_usage - yield chunk - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk + if include_usage: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk diff --git a/xinference/model/llm/transformers/internlm2.py b/xinference/model/llm/transformers/internlm2.py index fc7b1c7588..fa046be8de 100644 --- a/xinference/model/llm/transformers/internlm2.py +++ b/xinference/model/llm/transformers/internlm2.py @@ -11,23 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
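Note: many of the rewritten chat() methods above unpack parse_messages(messages) into the latest prompt, an optional system prompt, and the prior history of an OpenAI-style messages list. The helper itself sits in xinference/model/llm/utils.py and is not visible in this excerpt; the sketch below is a hypothetical reconstruction based only on how it is called (prompt, system_prompt, chat_history = parse_messages(messages)), not the actual implementation.

from typing import Dict, List, Optional, Tuple


def parse_messages(messages: List[Dict]) -> Tuple[str, Optional[str], List[Dict]]:
    """Split an OpenAI-style messages list into (prompt, system_prompt, chat_history)."""
    system_contents = [m["content"] for m in messages if m["role"] == "system"]
    non_system = [m for m in messages if m["role"] != "system"]
    if not non_system:
        raise ValueError("messages must contain at least one non-system message")
    prompt = non_system[-1]["content"]      # the message currently being answered
    chat_history = non_system[:-1]          # earlier turns, in order
    system_prompt = system_contents[0] if system_contents else None
    return prompt, system_prompt, chat_history


# Example
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "What does this patch change?"},
]
print(parse_messages(messages))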
-import time import uuid from typing import Any, Dict, Iterator, List, Optional, Union from ....core.scheduler import InferenceRequest -from ....types import ( - ChatCompletion, - ChatCompletionChoice, - ChatCompletionChunk, - ChatCompletionMessage, - CompletionChoice, - CompletionChunk, - CompletionUsage, - LoRA, - PytorchGenerateConfig, -) +from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk, parse_messages from .core import PytorchChatModel, PytorchModelConfig @@ -106,9 +96,7 @@ def prepare_sanitize_generate_config(self, req: InferenceRequest): def chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: kwargs: Dict[str, Any] = {} @@ -130,6 +118,8 @@ def chat( if isinstance(stream_options, dict) else False ) + + prompt, system_prompt, chat_history = parse_messages(messages) if chat_history: input_history = [ (chat_history[i]["content"], (chat_history[i + 1]["content"])) @@ -155,54 +145,42 @@ def _stream_generator(): total_tokens = prompt_tokens + completion_tokens chunk_text = chunk_text[last_chunk_text_length:] last_chunk_text_length += len(chunk_text) - completion_choice = CompletionChoice( - text=chunk_text, index=0, logprobs=None, finish_reason=None - ) - yield CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - usage=CompletionUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=total_tokens, - ), + + yield generate_completion_chunk( + chunk_text, + finish_reason=None, + chunk_id=chunk_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, ) + yield generate_completion_chunk( + None, + finish_reason="stop", + chunk_id=chunk_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) if include_usage: - chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + None, + finish_reason=None, + chunk_id=chunk_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, ) - yield chunk return self._to_chat_completion_chunks(_stream_generator()) else: response, _ = self._model.chat( self._tokenizer, prompt, input_history, **kwargs ) - return ChatCompletion( - id="chat" + str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - ChatCompletionChoice( - index=0, - message={"role": "assistant", "content": response}, - finish_reason="stop", - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) + return generate_chat_completion(self.model_uid, response) diff --git a/xinference/model/llm/transformers/llama_2.py b/xinference/model/llm/transformers/llama_2.py deleted file mode 100644 index 4e5e01d263..0000000000 --- a/xinference/model/llm/transformers/llama_2.py +++ /dev/null @@ -1,108 
+0,0 @@ -# Copyright 2022-2023 XProbe Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional - -from ....types import LoRA -from ..llm_family import LLMFamilyV1, LLMSpecV1 -from .core import PytorchChatModel, PytorchModel, PytorchModelConfig - - -class LlamaPytorchModel(PytorchModel): - def __init__( - self, - model_uid: str, - model_family: "LLMFamilyV1", - model_spec: "LLMSpecV1", - quantization: str, - model_path: str, - pytorch_model_config: Optional[PytorchModelConfig] = None, - peft_model: Optional[List[LoRA]] = None, - ): - super().__init__( - model_uid, - model_family, - model_spec, - quantization, - model_path, - pytorch_model_config=pytorch_model_config, - peft_model=peft_model, - ) - - def _load_model(self, **kwargs): - model, tokenizer = super()._load_model(**kwargs) - # Llama has no pad token by default - # https://github.com/huggingface/transformers/blob/07998ef39926b76d3f6667025535d0859eed61c3/docs/source/en/llm_tutorial.md?plain=1#L125 - tokenizer.pad_token = tokenizer.eos_token - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - @classmethod - def match( - cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - if llm_spec.model_format != "pytorch": - return False - model_family = llm_family.model_family or llm_family.model_name - if "llama-2" not in model_family: - return False - if "generate" not in llm_family.model_ability: - return False - return True - - -class LlamaPytorchChatModel(PytorchChatModel): - def __init__( - self, - model_uid: str, - model_family: "LLMFamilyV1", - model_spec: "LLMSpecV1", - quantization: str, - model_path: str, - pytorch_model_config: Optional["PytorchModelConfig"] = None, - peft_model: Optional[List[LoRA]] = None, - ): - super().__init__( - model_uid, - model_family, - model_spec, - quantization, - model_path, - peft_model=peft_model, - pytorch_model_config=pytorch_model_config, - ) - self._use_fast_tokenizer = False - - def _load_model(self, **kwargs): - model, tokenizer = super()._load_model(**kwargs) - # Llama has no pad token by default - # https://github.com/huggingface/transformers/blob/07998ef39926b76d3f6667025535d0859eed61c3/docs/source/en/llm_tutorial.md?plain=1#L125 - tokenizer.pad_token = tokenizer.eos_token - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - @classmethod - def match( - cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - if llm_spec.model_format != "pytorch": - return False - model_family = llm_family.model_family or llm_family.model_name - if "llama-2" not in model_family: - return False - if "chat" not in llm_family.model_ability: - return False - return True diff --git a/xinference/model/llm/transformers/minicpmv25.py b/xinference/model/llm/transformers/minicpmv25.py index af22319759..41b100d867 100644 --- 
a/xinference/model/llm/transformers/minicpmv25.py +++ b/xinference/model/llm/transformers/minicpmv25.py @@ -13,25 +13,21 @@ # limitations under the License. import json import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Union import torch -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -125,12 +121,11 @@ def _message_content_to_chat(self, content): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: stream = generate_config.get("stream", False) if generate_config else False + prompt, _, chat_history = parse_messages(messages) content, images_chat = self._message_content_to_chat(prompt) msgs = [] @@ -166,57 +161,29 @@ def chat( it = self.chat_stream(chat) return self._to_chat_completion_chunks(it) else: - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=chat, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(c) + return generate_chat_completion(self.model_uid, chat) def chat_stream(self, chat) -> Iterator[CompletionChunk]: completion_id = str(uuid.uuid1()) for new_text in chat: - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, ) - chunk["usage"] = completion_usage - yield chunk - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk diff --git a/xinference/model/llm/transformers/minicpmv26.py b/xinference/model/llm/transformers/minicpmv26.py index 0900bc4a86..7e97bca4f0 100644 --- a/xinference/model/llm/transformers/minicpmv26.py +++ b/xinference/model/llm/transformers/minicpmv26.py @@ -12,7 +12,6 @@ # See the License for the specific language governing 
permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from typing import Dict, Iterator, List, Optional, Union @@ -20,18 +19,15 @@ import torch from PIL import Image -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -160,13 +156,12 @@ def _load_video(_url): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: stream = generate_config.get("stream", False) if generate_config else False videoExisted = False + prompt, _, chat_history = parse_messages(messages) content, images_chat, video_frames = self._message_content_to_chat(prompt) if len(video_frames) > 0: @@ -216,57 +211,28 @@ def chat( it = self.chat_stream(chat) return self._to_chat_completion_chunks(it) else: - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=chat, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return self._to_chat_completion(c) + return generate_chat_completion(self.model_uid, chat) def chat_stream(self, chat) -> Iterator[CompletionChunk]: completion_id = str(uuid.uuid1()) for new_text in chat: - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, ) - chunk["usage"] = completion_usage - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=-1, completion_tokens=-1, total_tokens=-1, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk diff --git a/xinference/model/llm/transformers/omnilmm.py b/xinference/model/llm/transformers/omnilmm.py index 583f3cc56e..3ddffda0a4 100644 --- a/xinference/model/llm/transformers/omnilmm.py +++ b/xinference/model/llm/transformers/omnilmm.py @@ -16,20 +16,13 @@ import logging import operator import tempfile -import time -import uuid from typing import Dict, Iterator, List, 
Optional, Tuple, Union from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64 -from ....types import ( - ChatCompletion, - ChatCompletionChoice, - ChatCompletionChunk, - ChatCompletionMessage, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk from ...utils import select_device from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, parse_messages from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -96,15 +89,14 @@ def _ensure_url(_url): def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: if generate_config and generate_config.get("stream"): raise Exception( f"Chat with model {self.model_family.model_name} does not support stream." ) + prompt, _, chat_history = parse_messages(messages) image_first, prompt = self._message_content_to_OmniLMM(prompt) msgs = [] @@ -135,19 +127,4 @@ def chat( input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)} answer = self._model.chat(input=input) - return ChatCompletion( - id="chat" + str(uuid.uuid1()), - object="chat.completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - ChatCompletionChoice( - index=0, - message={"role": "assistant", "content": answer}, - finish_reason="stop", - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) + return generate_chat_completion(self.model_uid, answer) diff --git a/xinference/model/llm/transformers/qwen_vl.py b/xinference/model/llm/transformers/qwen_vl.py index 8a2be562e3..e7db57334c 100644 --- a/xinference/model/llm/transformers/qwen_vl.py +++ b/xinference/model/llm/transformers/qwen_vl.py @@ -15,7 +15,6 @@ import logging import operator import tempfile -import time import typing import uuid from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -25,16 +24,9 @@ from ....core.scheduler import InferenceRequest from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk from .core import PytorchChatModel, PytorchGenerateConfig from .utils import pad_prefill_tokens @@ -129,18 +121,12 @@ def _ensure_url(_url): return self._tokenizer.from_list_format(content) return content - def _get_prompt_and_chat_history( - self, - prompt: Union[str, List[Dict]], - chat_history: Optional[List[ChatCompletionMessage]] = None, - ): - prompt = self._message_content_to_qwen(prompt) - # Convert openai history to qwen vl history + def _get_prompt_and_chat_history(self, messages: List[Dict]): qwen_history = [] query_to_response: List = [] - for h in chat_history or []: - role = h["role"] - content = self._message_content_to_qwen(h["content"]) + for message in messages[:-1]: + role = message["role"] + content = self._message_content_to_qwen(message["content"]) if len(query_to_response) == 0 and role == "user": query_to_response.append(content) if len(query_to_response) == 1 and role == "assistant": @@ -148,18 +134,15 @@ def 
_get_prompt_and_chat_history( if len(query_to_response) == 2: qwen_history.append(query_to_response) query_to_response = [] + prompt = self._message_content_to_qwen(messages[-1]["content"]) return prompt, qwen_history def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - prompt, qwen_history = self._get_prompt_and_chat_history( - prompt, chat_history=chat_history - ) + prompt, qwen_history = self._get_prompt_and_chat_history(messages) stream = generate_config.get("stream", False) if generate_config else False stream_options = ( @@ -174,33 +157,17 @@ def chat( it = self._generate_stream(prompt, qwen_history, include_usage) # type: ignore return self._to_chat_completion_chunks(it) else: - c = self._generate(prompt, qwen_history) # type: ignore - return self._to_chat_completion(c) + return self._generate(prompt, qwen_history) # type: ignore - def _generate(self, prompt: str, qwen_history: List) -> Completion: + def _generate(self, prompt: str, qwen_history: List) -> ChatCompletion: response, history = self._model.chat( self._tokenizer, query=prompt, history=qwen_history ) - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=response, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return c + return generate_chat_completion(self.model_uid, response) def _generate_stream( self, prompt: str, qwen_history: List, include_usage ) -> Iterator[CompletionChunk]: - # response, history = model.chat(tokenizer, message, history=history) response_generator = self._model.chat_stream( self._tokenizer, query=prompt, history=qwen_history ) @@ -212,57 +179,40 @@ def _generate_stream( for response in response_generator: inc_content = response[len(full_response) :] full_response = response - completion_choice = CompletionChoice( - text=inc_content, index=0, logprobs=None, finish_reason=None - ) - completion_chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = completion_tokens + 1 total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=inc_content, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - completion_chunk["usage"] = completion_usage - yield completion_chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - completion_chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - completion_chunk["usage"] = completion_usage - yield completion_chunk if include_usage: - chunk = CompletionChunk( - 
id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk @staticmethod def get_batch_size_and_seq_len_indexes_from_kv() -> Tuple[int, int]: @@ -359,10 +309,8 @@ def _tokenize_str(role, content): return raw_text, context_tokens - def _get_full_prompt(self, prompt, system_prompt, chat_history, tools): - prompt, qwen_history = self._get_prompt_and_chat_history( - prompt, chat_history=chat_history - ) + def _get_full_prompt(self, messages: List[Dict], tools): + prompt, qwen_history = self._get_prompt_and_chat_history(messages) _, context_tokens = self.make_context(self._tokenizer, prompt, qwen_history) return context_tokens diff --git a/xinference/model/llm/transformers/tests/test_tensorizer.py b/xinference/model/llm/transformers/tests/test_tensorizer.py index a4e228259c..87fd38a7a7 100644 --- a/xinference/model/llm/transformers/tests/test_tensorizer.py +++ b/xinference/model/llm/transformers/tests/test_tensorizer.py @@ -37,7 +37,9 @@ def setup_and_teardown(self): model_lang=["en", "zh"], model_ability=["chat", "tools"], model_specs=[spec], - prompt_style=None, + chat_template=None, + stop_token_ids=None, + stop=None, ) if not os.path.exists(self.model_path): diff --git a/xinference/model/llm/transformers/utils.py b/xinference/model/llm/transformers/utils.py index 5ada9a512c..d34112d24f 100644 --- a/xinference/model/llm/transformers/utils.py +++ b/xinference/model/llm/transformers/utils.py @@ -321,7 +321,7 @@ def generate_stream( if stream: completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason=finish_reason + index=0, logprobs=None, finish_reason=finish_reason ) else: completion_choice = CompletionChoice( @@ -430,39 +430,6 @@ def pad_prefill_tokens( return prompt_tokens -def _get_completion_chunk( - output: str, - chunk_id: str, - finish_reason: Optional[str], - model_uid: str, - r: InferenceRequest, - just_usage: bool, -): - completion_choice = ( - [ - CompletionChoice( - text=output, index=0, logprobs=None, finish_reason=finish_reason - ) - ] - if not just_usage - else [] - ) - completion_chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=model_uid, - choices=completion_choice, - ) - completion_usage = CompletionUsage( - prompt_tokens=len(r.prompt_tokens), - completion_tokens=len(r.new_tokens), - total_tokens=len(r.prompt_tokens) + len(r.new_tokens), - ) - completion_chunk["usage"] = completion_usage - return completion_chunk - - def _get_completion( output: str, chunk_id: str, @@ -551,6 +518,8 @@ def _batch_inference_one_step_internal( bos_flag: str = "", eos_flag: str = "", ): + from ..utils import generate_completion_chunk + # need to judge stopped here, # since some requests state may change to stopped due to invalid parameters, e.g. 
max_src_len valid_req_list = [r for r in req_list if not r.stopped] @@ -710,11 +679,30 @@ def _batch_inference_one_step_internal( output = output[r.last_output_length :] r.last_output_length += len(output) - completion_chunk = _get_completion_chunk( - output, r.chunk_id, r.finish_reason, model_uid, r, False + completion_chunk = generate_completion_chunk( + chunk_text=output, + finish_reason=None, + chunk_id=r.chunk_id, + model_uid=model_uid, + prompt_tokens=len(r.prompt_tokens), + completion_tokens=len(r.new_tokens), + total_tokens=len(r.prompt_tokens) + len(r.new_tokens), ) r.completion.append(completion_chunk) if r.stopped: + # OpenAI compatible chunk + completion_chunk = generate_completion_chunk( + chunk_text=None, + finish_reason=r.finish_reason, + chunk_id=r.chunk_id, + model_uid=model_uid, + prompt_tokens=len(r.prompt_tokens), + completion_tokens=len(r.new_tokens), + total_tokens=len(r.prompt_tokens) + len(r.new_tokens), + has_choice=True, + has_content=False, + ) + r.completion.append(completion_chunk) r.completion.append(eos_flag) # last round, handle stream result @@ -723,8 +711,16 @@ def _batch_inference_one_step_internal( # these tokens are real generated and should be counted. if r.stopped and _i == decode_round - 1 and include_usage: r.completion.append( - _get_completion_chunk( - "", r.chunk_id, r.finish_reason, model_uid, r, True + generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=r.chunk_id, + model_uid=model_uid, + prompt_tokens=len(r.prompt_tokens), + completion_tokens=len(r.new_tokens), + total_tokens=len(r.prompt_tokens) + len(r.new_tokens), + has_choice=False, + has_content=False, ) ) else: diff --git a/xinference/model/llm/transformers/yi_vl.py b/xinference/model/llm/transformers/yi_vl.py index e4b3d1f6ce..9cfa87a536 100644 --- a/xinference/model/llm/transformers/yi_vl.py +++ b/xinference/model/llm/transformers/yi_vl.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import time import uuid from concurrent.futures import ThreadPoolExecutor from threading import Thread @@ -21,17 +20,14 @@ import torch from ....model.utils import select_device -from ....types import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessage, - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk from ..llm_family import LLMFamilyV1, LLMSpecV1 -from ..utils import _decode_image +from ..utils import ( + _decode_image, + generate_chat_completion, + generate_completion_chunk, + parse_messages, +) from .core import PytorchChatModel, PytorchGenerateConfig logger = logging.getLogger(__name__) @@ -105,15 +101,11 @@ def _message_content_to_yi(content) -> Union[str, tuple]: def chat( self, - prompt: Union[str, List[Dict]], - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[PytorchGenerateConfig] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: from transformers import TextIteratorStreamer - # TODO(codingl2k1): implement stream mode. 
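
For reference, the streaming contract that the refactored code above (the qwen-vl rewrite and the batching helper in transformers/utils.py) converges on is: one CompletionChunk per text delta, then a final chunk that carries only the finish_reason, and, when usage reporting is requested via include_usage, a trailing usage-only chunk whose choices list is empty. A minimal consumer sketch under that assumption; the function and variable names here are illustrative, not part of xinference:

# Sketch only: collect a stream of CompletionChunk dicts produced by the new helpers.
def collect_stream(chunks):
    text_parts, finish_reason, usage = [], None, None
    for chunk in chunks:
        choices = chunk.get("choices", [])
        if not choices:  # trailing usage-only chunk (include_usage=True)
            usage = chunk.get("usage")
            continue
        choice = choices[0]
        if choice.get("finish_reason") is not None:
            finish_reason = choice["finish_reason"]  # final, content-less chunk
        else:
            text_parts.append(choice.get("text", ""))
    return "".join(text_parts), finish_reason, usage
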
- if not generate_config: generate_config = {} @@ -134,7 +126,8 @@ def chat( # Convert chat history to llava state state = conv_templates["mm_default"].copy() - for message in chat_history or []: + prompt, _, chat_history = parse_messages(messages) + for message in chat_history: content = self._message_content_to_yi(message["content"]) state.append_message(message["role"], content) state.append_message(state.roles[0], self._message_content_to_yi(prompt)) @@ -190,31 +183,15 @@ def chat( it = self._generate_stream(streamer, stop_str, input_ids, include_usage) return self._to_chat_completion_chunks(it) else: - c = self._generate(streamer, stop_str) - return self._to_chat_completion(c) + return self._generate(streamer, stop_str) - def _generate(self, streamer, stop_str) -> Completion: + def _generate(self, streamer, stop_str) -> ChatCompletion: generated_text = "" for new_text in streamer: generated_text += new_text if generated_text.endswith(stop_str): generated_text = generated_text[: -len(stop_str)] - - c = Completion( - id=str(uuid.uuid1()), - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[ - CompletionChoice( - index=0, text=generated_text, finish_reason="stop", logprobs=None - ) - ], - usage=CompletionUsage( - prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 - ), - ) - return c + return generate_chat_completion(self.model_uid, generated_text) def _generate_stream( self, streamer, stop_str, input_ids, include_usage @@ -224,54 +201,37 @@ def _generate_stream( prompt_tokens = len(input_ids[0]) for i, new_text in enumerate(streamer): if not new_text.endswith(stop_str): - completion_choice = CompletionChoice( - text=new_text, index=0, logprobs=None, finish_reason=None - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) completion_tokens = i total_tokens = prompt_tokens + completion_tokens - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) - chunk["usage"] = completion_usage - yield chunk - - completion_choice = CompletionChoice( - text="", index=0, logprobs=None, finish_reason="stop" - ) - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=True, + has_content=False, ) - chunk["usage"] = completion_usage - yield chunk if include_usage: - chunk = CompletionChunk( - id=completion_id, - object="text_completion", - created=int(time.time()), - model=self.model_uid, - choices=[], - ) - chunk["usage"] = CompletionUsage( + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + has_choice=False, + has_content=False, ) - yield chunk diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 8107d890a0..974671720e 100644 --- a/xinference/model/llm/utils.py +++ 
b/xinference/model/llm/utils.py @@ -17,6 +17,7 @@ import logging import os import time +import typing import uuid from io import BytesIO from typing import AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast @@ -25,19 +26,19 @@ from PIL import Image from ...types import ( - SPECIAL_TOOL_PROMPT, ChatCompletion, + ChatCompletionChoice, ChatCompletionChunk, - ChatCompletionMessage, Completion, + CompletionChoice, CompletionChunk, + CompletionUsage, ) from ..utils import ensure_cache_cleared from .llm_family import ( LlamaCppLLMSpecV1, LLMFamilyV1, LLMSpecV1, - PromptStyleV1, _get_cache_dir, get_cache_status, ) @@ -46,7 +47,6 @@ QWEN_TOOL_CALL_FAMILY = [ - "qwen-chat", "qwen1.5-chat", "qwen1.5-moe-chat", "qwen2-instruct", @@ -58,416 +58,90 @@ "glm4-chat-1m", ] +QWEN_TOOL_CALL_SYMBOLS = ["", ""] + class ChatModelMixin: @staticmethod - def get_prompt( - prompt: str, - chat_history: List[ChatCompletionMessage], - prompt_style: PromptStyleV1, - tools: Optional[List[Dict]] = None, - ): + @functools.lru_cache + def _compile_jinja_template(chat_template): """ - Inspired by FastChat. Format chat history into a prompt according to the prompty style of - different models. + Copied from transformers source code. """ - assert prompt_style.roles is not None - if prompt != SPECIAL_TOOL_PROMPT: - chat_history.append( - ChatCompletionMessage(role=prompt_style.roles[0], content=prompt) - ) - chat_history.append( - ChatCompletionMessage(role=prompt_style.roles[1], content="") + try: + from jinja2.exceptions import TemplateError + from jinja2.sandbox import ImmutableSandboxedEnvironment + except ImportError: + raise ImportError("xinference requires jinja2 to be installed.") + + def raise_exception(message): + raise TemplateError(message) + + jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True) + jinja_env.globals["raise_exception"] = raise_exception + return jinja_env.from_string(chat_template) + + def _build_from_raw_template( + self, messages: List, chat_template: str, **kwargs + ) -> str: + compiled_template = self._compile_jinja_template(chat_template) + rendered = compiled_template.render( + messages=messages, add_generation_prompt=True, **kwargs ) - - def get_role(role_name: str): - if role_name == "user": - return prompt_style.roles[0] - elif role_name == "assistant": - return prompt_style.roles[1] - else: - return role_name - - if prompt_style.style_name == "ADD_COLON_SINGLE": - ret = prompt_style.system_prompt + prompt_style.intra_message_sep - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + ": " + content + prompt_style.intra_message_sep - else: - ret += role + ":" - return ret - elif prompt_style.style_name == "NO_COLON_TWO": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = prompt_style.system_prompt - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + content + seps[i % 2] - else: - ret += role - return ret - elif prompt_style.style_name == "LLAMA2": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = "" - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - if i == 0: - ret += prompt_style.system_prompt + content - else: - ret += role + " " + content + seps[i % 2] - else: - ret += role - return ret - elif prompt_style.style_name == "LLAMA3": - ret = ( - 
f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>" - f"{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}" - ) - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += ( - f"<|start_header_id|>{role}<|end_header_id|>" - f"{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}" - ) - else: - ret += f"<|start_header_id|>{role}<|end_header_id|>{prompt_style.intra_message_sep}" - return ret - elif prompt_style.style_name == "MIXTRAL_V01": - ret = "" - for i, message in enumerate(chat_history): - content = message["content"] - if i % 2 == 0: # user - ret += f" [INST] {content} [/INST]" - else: # assistant - ret += f"{content} " - return ret - elif prompt_style.style_name == "CHATGLM3": - prompts = ( - [f"<|system|>\n {prompt_style.system_prompt}"] - if prompt_style.system_prompt - else [] - ) - - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message.get("content") - tool_calls = message.get("tool_calls") - if tool_calls: - content = tool_calls[0]["function"] - if content: - if role == "tool": - role = "observation" - prompts.append(f"<|{role}|>\n {content}") - else: - prompts.append(f"<|{role}|>") - return "\n".join(prompts) - elif prompt_style.style_name == "XVERSE": - ret = ( - f"<|system|> \n {prompt_style.system_prompt}" - if prompt_style.system_prompt - else "" - ) - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += f"<|{role}|> \n {content}" - else: - ret += f"<|{role}|>" - return ret - elif prompt_style.style_name == "QWEN": - if tools: - tool_desc = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters} Format the arguments as a JSON object.""" - - react_instruction = """Answer the following questions as best you can. You have access to the following APIs: - -{tools_text} - -Use the following format: - -Question: the input question you must answer -Thought: you should always think about what to do -Action: the action to take, should be one of [{tools_name_text}] -Action Input: the input to the action -Observation: the result of the action -... (this Thought/Action/Action Input/Observation can be repeated zero or more times) -Thought: I now know the final answer -Final Answer: the final answer to the original input question - -Begin!""" - tools_text = [] - tools_name_text = [] - for func_info in tools: - parameters = [] - fp = func_info["function"].get("parameters", {}) - if fp: - required_parameters = fp.get("required", []) - for name, p in fp["properties"].items(): - param = dict({"name": name}, **p) - if name in required_parameters: - param["required"] = True - parameters.append(param) - - name = func_info["function"]["name"] - desc = func_info["function"]["description"] - tool_string = tool_desc.format( - name_for_model=name, - name_for_human=name, - # Hint: You can add the following format requirements in description: - # "Format the arguments as a JSON object." - # "Enclose the code within triple backticks (`) at the beginning and end of the code." 
- description_for_model=desc, - parameters=json.dumps(parameters, ensure_ascii=False), - ) - tools_text.append(tool_string) - tools_name_text.append(name) - tools_text_string = "\n\n".join(tools_text) - tools_name_text_string = ", ".join(tools_name_text) - tool_system = react_instruction.format( - tools_text=tools_text_string, - tools_name_text=tools_name_text_string, + return rendered + + def get_full_context( + self, messages: List, chat_template: str, tokenizer=None, **kwargs + ) -> str: + if tokenizer is not None: + try: + full_context = tokenizer.apply_chat_template( + messages, + tokenize=False, + chat_template=chat_template, + add_generation_prompt=True, + **kwargs, ) - else: - tool_system = "" - - ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>" - for message in chat_history: - role = get_role(message["role"]) - content = message.get("content") - - ret += prompt_style.intra_message_sep - if tools: - if role == "user": - if tool_system: - content = tool_system + f"\n\nQuestion: {content}" - tool_system = "" - else: - content = f"Question: {content}" - elif role == "assistant": - tool_calls = message.get("tool_calls") - if tool_calls: - func_call = tool_calls[0]["function"] - f_name, f_args = ( - func_call["name"], - func_call["arguments"], - ) - content = f"Thought: I can use {f_name}.\nAction: {f_name}\nAction Input: {f_args}" - elif content: - content = f"Thought: I now know the final answer.\nFinal answer: {content}" - elif role == "tool": - role = "function" - content = f"Observation: {content}" - else: - raise Exception(f"Unsupported message role: {role}") - if content: - content = content.lstrip("\n").rstrip() - ret += f"<|im_start|>{role}\n{content}<|im_end|>" - else: - ret += f"<|im_start|>{role}\n" - return ret - elif prompt_style.style_name == "CHATML": - ret = ( - "" - if prompt_style.system_prompt == "" - else prompt_style.system_prompt + prompt_style.intra_message_sep + "\n" - ) - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] + return full_context + except Exception as e: + logger.warning( + f"tokenizer.apply_chat_template error. Maybe this is an old model: {e}" + ) + return self._build_from_raw_template(messages, chat_template, **kwargs) + else: + # build from jinja + # Compilation function uses a cache to avoid recompiling the same template + return self._build_from_raw_template(messages, chat_template, **kwargs) - if content: - ret += role + "\n" + content + prompt_style.intra_message_sep + "\n" - else: - ret += role + "\n" - return ret - elif prompt_style.style_name == "INTERNLM2": - ret = ( - "" - if prompt_style.system_prompt == "" - else "<|im_start|>system\n" - + prompt_style.system_prompt - + prompt_style.intra_message_sep - + "\n" - ) - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] + @staticmethod + def get_specific_prompt(model_family: str, messages: List[Dict]): + """ + Inspired by FastChat. Format chat history into a prompt according to the prompty style of + different models. 
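
The jinja-based path added above (_compile_jinja_template, _build_from_raw_template and get_full_context) is what replaces the hand-written prompt styles being deleted in this hunk: when a tokenizer is available its apply_chat_template is tried first, otherwise the family's chat_template string is rendered directly in a sandboxed jinja environment. A minimal sketch of that direct rendering; the ChatML-style template text below is only an example, not a template shipped by xinference:

# Sketch only, assuming jinja2 is installed (it is required by this code path).
from jinja2.sandbox import ImmutableSandboxedEnvironment

example_template = (
    "{% for message in messages %}"
    "<|im_start|>{{ message['role'] }}\n{{ message['content'] }}<|im_end|>\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
)
env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
prompt = env.from_string(example_template).render(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    add_generation_prompt=True,
)
# prompt now ends with "<|im_start|>assistant\n", ready to be passed to generation.
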
+ """ + _messages = [x for x in messages] # copy for not modifying the origin messages + _messages.append({"role": "assistant", "content": ""}) - if content: - ret += role + "\n" + content + prompt_style.intra_message_sep + "\n" - else: - ret += role + "\n" - return ret - elif prompt_style.style_name == "ADD_COLON_SINGLE_COT": - ret = prompt_style.system_prompt + prompt_style.intra_message_sep - for message in chat_history: - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + ": " + content + prompt_style.intra_message_sep - else: - ret += role + ": Let's think step by step." - return ret - elif prompt_style.style_name == "DEEPSEEK_CHAT": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = prompt_style.system_prompt - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + ": " + content + seps[i % 2] - else: - ret += role + ":" - return ret - elif prompt_style.style_name == "DEEPSEEK_CODER": - sep = prompt_style.inter_message_sep - ret = prompt_style.system_prompt + sep - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += role + "\n" + content + sep - else: - ret += role + "\n" - return ret - elif prompt_style.style_name == "GORILLA_OPENFUNCTIONS": - if tools: - gorilla_functions = [] - for tool in tools: - gorilla_functions.append( - { - "name": tool["function"]["name"], - "api_name": tool["function"]["name"], - "description": tool["function"]["description"], - "parameters": [ - dict({"name": name}, **p) - for name, p in tool["function"]["parameters"][ - "properties" - ].items() - ], - } - ) - tools_string = json.dumps(gorilla_functions) - return f"USER: <> {prompt} <> {tools_string}\nASSISTANT: " - else: - return f"USER: <> {prompt}\nASSISTANT: " - elif prompt_style.style_name == "orion": - ret = "" - for i, message in enumerate(chat_history): - content = message["content"] - role = get_role(message["role"]) - if i % 2 == 0: # Human - assert content is not None - ret += role + ": " + content + "\n\n" - else: # Assistant - if content: - ret += role + ": " + content + "" - else: - ret += role + ": " - return ret - elif prompt_style.style_name == "gemma": - ret = "" - for message in chat_history: - content = message["content"] - role = get_role(message["role"]) - ret += "" + role + "\n" - if content: - ret += content + "\n" - return ret - elif prompt_style.style_name == "CodeShell": - ret = "" - for message in chat_history: - content = message["content"] - role = get_role(message["role"]) - if content: - ret += f"{role}{content}||" - else: - ret += f"{role}".rstrip() - return ret - elif prompt_style.style_name == "MINICPM-2B": - ret = "" - for message in chat_history: - content = message["content"] or "" - role = get_role(message["role"]) - if role == "user": - ret += "<用户>" + content.strip() - else: - ret += "" + content.strip() - return ret - elif prompt_style.style_name == "PHI3": - ret = f"<|system|>{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}" - for message in chat_history: - content = message["content"] or "" - role = get_role(message["role"]) - if content: - ret += f"<|{role}|>{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}" - else: - ret += f"<|{role}|>{prompt_style.intra_message_sep}" - ret += "<|assistant|>\n" - return ret - elif prompt_style.style_name == "c4ai-command-r": 
- ret = ( - f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" - f"{prompt_style.system_prompt}{prompt_style.inter_message_sep}" + if model_family == "internvl2": + system_prompt = ( + messages[0]["content"] if messages[0]["role"] == "system" else "" ) - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - ret += f"{role}{content}{prompt_style.inter_message_sep}" - else: - ret += role - return ret - elif prompt_style.style_name == "mistral-nemo": - seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] - ret = "" - for i, message in enumerate(chat_history): - role = get_role(message["role"]) - content = message["content"] - if content: - if i == len(chat_history) - 2 and prompt_style.system_prompt: - ret += ( - role - + " " - + prompt_style.system_prompt - + "\n\n" - + content - + seps[i % 2] - ) - else: - ret += role + " " + content + seps[i % 2] - else: - ret += role - return ret - elif prompt_style.style_name == "INTERNVL": + intra_message_sep = "<|im_end|>" ret = ( "" - if prompt_style.system_prompt == "" + if system_prompt == "" else "<|im_start|>system\n" - + prompt_style.system_prompt - + prompt_style.intra_message_sep + + system_prompt + + intra_message_sep + "\n" ) images = [] # type: ignore - for message in chat_history: - role = get_role(message["role"]) + for message in _messages: + role = "<|im_start|>" + message["role"] content = message["content"] if isinstance(content, str): if content: - ret += ( - role - + "\n" - + content - + prompt_style.intra_message_sep - + "\n" - ) + ret += role + "\n" + content + intra_message_sep + "\n" else: ret += role + "\n" elif isinstance(content, list): @@ -488,21 +162,15 @@ def get_role(role_name: str): image_futures.append(fut) images = [fut.result() for fut in image_futures] if len(image_futures) == 0: - ret += ( - role + "\n" + text + prompt_style.intra_message_sep + "\n" - ) + ret += role + "\n" + text + intra_message_sep + "\n" else: ret += ( - role - + "\n" - + f"\n{text}" - + prompt_style.intra_message_sep - + "\n" + role + "\n" + f"\n{text}" + intra_message_sep + "\n" ) - return (ret, images) + return ret, images else: - raise ValueError(f"Invalid prompt style: {prompt_style.style_name}") + raise ValueError(f"Invalid model family: {model_family}") @classmethod def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk: @@ -523,7 +191,7 @@ def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChun { "index": i, "delta": { - "content": choice.get("text"), + **({"content": choice["text"]} if "text" in choice else {}), **( {"tool_calls": choice["tool_calls"]} if "tool_calls" in choice @@ -632,83 +300,39 @@ def _to_chat_completion(completion: Completion) -> ChatCompletion: } @staticmethod - def _eval_gorilla_openfunctions_arguments(c, tools): - tool_names = [tool["function"]["name"] for tool in tools] - arguments = c["choices"][0]["text"] - - def tool_call(n, **kwargs): - return None, n, kwargs - + def _eval_glm_chat_arguments(c): try: - a, b, c = eval( - arguments, {n: functools.partial(tool_call, n) for n in tool_names} - ) - return a, b, c - except Exception as e: - logger.error("Eval tool calls completion failed: %s", e) - return arguments, None, None - - @staticmethod - def _eval_glm_chat_arguments(c, tools): - try: - if isinstance(c[0], str): - return c[0], None, None - return None, c[0]["name"], c[0]["parameters"] + if isinstance(c, dict): + return None, c["name"], c["arguments"] except KeyError: 
logger.error("Can't parse glm output: %s", c) return str(c), None, None + else: + return str(c), None, None @staticmethod - def _eval_qwen_chat_arguments(c, tools): + def _eval_qwen_chat_arguments(c): text = c["choices"][0]["text"] + text: str = text.strip() + if text.startswith(QWEN_TOOL_CALL_SYMBOLS[0]): + text = text[len(QWEN_TOOL_CALL_SYMBOLS[0]) :] + if text.endswith(QWEN_TOOL_CALL_SYMBOLS[1]): + text = text[: -len(QWEN_TOOL_CALL_SYMBOLS[1])] + text = text.strip() try: - # Refer to: - # https://github.com/QwenLM/Qwen/blob/main/examples/react_prompt.md - # https://github.com/QwenLM/Qwen/blob/main/openai_api.py#L297 - func_name, func_args, content = "", "", "" - i = text.rfind("\nAction:") - j = text.rfind("\nAction Input:") - k = text.rfind("\nObservation:") - t = max( - text.rfind("\nThought:", 0, i), text.rfind("Thought:", 0, i) - ) # find the last thought just before Action, considering the Thought at the very beginning - if 0 <= i < j: # If the text has `Action` and `Action input`, - if k < j: # but does not contain `Observation`, - # then it is likely that `Observation` is omitted by the LLM, - # because the output text may have discarded the stop word. - text = text.rstrip() + "\nObservation:" # Add it back. - k = text.rfind("\nObservation:") - if 0 <= t < i < j < k: - func_name = text[i + len("\nAction:") : j].strip() - func_args = text[j + len("\nAction Input:") : k].strip() - content = text[ - t + len("\nThought:") : i - ].strip() # len("\nThought:") and len("Thought:") both are OK since there is a space after : - if func_name: - return content, func_name, json.loads(func_args) + content = json.loads(text) + return None, content["name"], content["arguments"] except Exception as e: - logger.error("Eval tool calls completion failed: %s", e) - t = max(text.rfind("\nThought:"), text.rfind("Thought:")) - z = max(text.rfind("\nFinal Answer:"), text.rfind("Final Answer:")) - if z >= 0: - text = text[ - z + len("\nFinal Answer:") : - ] # len("\nFinal Answer::") and len("Final Answer::") both are OK since there is a space after : - else: - text = text[ - t + len("\nThought:") : - ] # There is only Thought: no Final Answer: - return text, None, None + logger.error("Can't parse qwen tool call output: %s. Error: %s", text, e) + return text, None, None @classmethod - def _eval_tool_arguments(cls, model_family, c, tools): + def _eval_tool_arguments(cls, model_family, c): family = model_family.model_family or model_family.model_name - if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]: - content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools) - elif family in GLM4_TOOL_CALL_FAMILY: - content, func, args = cls._eval_glm_chat_arguments(c, tools) + if family in GLM4_TOOL_CALL_FAMILY: + content, func, args = cls._eval_glm_chat_arguments(c) elif family in QWEN_TOOL_CALL_FAMILY: - content, func, args = cls._eval_qwen_chat_arguments(c, tools) + content, func, args = cls._eval_qwen_chat_arguments(c) else: raise Exception( f"Model {model_family.model_name} is not support tool calls." 
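
The new _eval_qwen_chat_arguments above no longer walks the old ReAct transcript; it strips the markers stored in QWEN_TOOL_CALL_SYMBOLS and parses the remainder as a JSON object with "name" and "arguments". A standalone sketch of that parsing; the marker strings below are assumptions (qwen2-style tool-call tags), not values taken from this diff:

# Sketch only; the marker constants are assumed stand-ins for QWEN_TOOL_CALL_SYMBOLS.
import json

TOOL_CALL_START = "<tool_call>"
TOOL_CALL_END = "</tool_call>"

def parse_qwen_tool_call(text):
    text = text.strip()
    if text.startswith(TOOL_CALL_START):
        text = text[len(TOOL_CALL_START):]
    if text.endswith(TOOL_CALL_END):
        text = text[: -len(TOOL_CALL_END)]
    try:
        payload = json.loads(text.strip())
        return None, payload["name"], payload["arguments"]  # tool call
    except Exception:
        return text, None, None  # plain assistant content, no tool call

parse_qwen_tool_call('<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>')
# -> (None, "get_weather", {"city": "Paris"})
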
@@ -747,9 +371,9 @@ def process_tokens(tokens: str, delta: str): return lambda tokens, delta: delta @classmethod - def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools): + def _tool_calls_completion_chunk(cls, model_family, model_uid, c): _id = str(uuid.uuid4()) - content, func, args = cls._eval_tool_arguments(model_family, c, tools) + content, func, args = cls._eval_tool_arguments(model_family, c) if func: d = { "role": "assistant", @@ -760,7 +384,7 @@ def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools): "type": "function", "function": { "name": func, - "arguments": json.dumps(args), + "arguments": json.dumps(args, ensure_ascii=False), }, } ], @@ -795,9 +419,9 @@ def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools): } @classmethod - def _tool_calls_completion(cls, model_family, model_uid, c, tools): + def _tool_calls_completion(cls, model_family, model_uid, c): _id = str(uuid.uuid4()) - content, func, args = cls._eval_tool_arguments(model_family, c, tools) + content, func, args = cls._eval_tool_arguments(model_family, c) if func: m = { "role": "assistant", @@ -808,7 +432,7 @@ def _tool_calls_completion(cls, model_family, model_uid, c, tools): "type": "function", "function": { "name": func, - "arguments": json.dumps(args), + "arguments": json.dumps(args, ensure_ascii=False), }, } ], @@ -841,16 +465,6 @@ def _tool_calls_completion(cls, model_family, model_uid, c, tools): "usage": usage, } - @classmethod - def get_full_prompt(cls, model_family, prompt, system_prompt, chat_history, tools): - assert model_family.prompt_style is not None - prompt_style = model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] - full_prompt = cls.get_prompt(prompt, chat_history, prompt_style, tools=tools) - return full_prompt - def get_file_location( llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str @@ -903,3 +517,94 @@ def _decode_image(_url): return Image.open(_url).convert("RGB") else: return Image.open(BytesIO(response.content)).convert("RGB") + + +@typing.no_type_check +def generate_completion_chunk( + chunk_text: Optional[str], + finish_reason: Optional[str], + chunk_id: str, + model_uid: str, + prompt_tokens: int, + completion_tokens: int, + total_tokens: int, + has_choice: bool = True, + has_content: bool = True, +): + choices = [] + if has_choice: + choices.append( + CompletionChoice( + text=chunk_text, index=0, logprobs=None, finish_reason=finish_reason + ) + if has_content + else CompletionChoice(index=0, logprobs=None, finish_reason=finish_reason) + ) + return CompletionChunk( + id=chunk_id, + object="text_completion", + created=int(time.time()), + model=model_uid, + choices=choices, + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ), + ) + + +def generate_chat_completion( + model_uid: str, + response: str, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + finish_reason="stop", +) -> ChatCompletion: + return ChatCompletion( + id="chat" + str(uuid.uuid1()), + object="chat.completion", + created=int(time.time()), + model=model_uid, + choices=[ + ChatCompletionChoice( + index=0, + message={"role": "assistant", "content": response}, + finish_reason=finish_reason, + ) + ], + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ), + ) + + +@functools.lru_cache +def 
get_stop_token_ids_from_config_file(model_path: str) -> Optional[List[int]]: + from transformers import GenerationConfig as TransformersGenerationConfig + + transformers_config = TransformersGenerationConfig.from_pretrained(model_path) + if transformers_config.eos_token_id is not None: + stop_token_ids = ( + transformers_config.eos_token_id + if isinstance(transformers_config.eos_token_id, list) + else [transformers_config.eos_token_id] + ) + return stop_token_ids + return None + + +def parse_messages(messages: List[Dict]) -> Tuple: + """ + Some older models still follow the old way of parameter passing. + This function helps to parse out the needed information from OpenAI-compatible `messages`. + """ + system_messages = [mess["content"] for mess in messages if mess["role"] == "system"] + content_messages = [mess for mess in messages if mess["role"] != "system"] + prompt = content_messages[-1]["content"] + system_prompt = ". ".join(system_messages) if system_messages else None + chat_history = content_messages[:-1] + return prompt, system_prompt, chat_history diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 372efc7a3e..e97df3b8a1 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -13,7 +13,6 @@ # limitations under the License. import asyncio -import json import logging import multiprocessing import os @@ -24,9 +23,9 @@ Any, AsyncGenerator, Dict, - Iterable, List, Optional, + Tuple, TypedDict, Union, ) @@ -34,18 +33,20 @@ from ....types import ( ChatCompletion, ChatCompletionChunk, - ChatCompletionMessage, Completion, CompletionChoice, CompletionChunk, CompletionUsage, LoRA, - ToolCallFunction, - ToolCalls, ) from .. import LLM, LLMFamilyV1, LLMSpecV1 from ..llm_family import CustomLLMFamilyV1 -from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin +from ..utils import ( + QWEN_TOOL_CALL_FAMILY, + QWEN_TOOL_CALL_SYMBOLS, + ChatModelMixin, + generate_completion_chunk, +) logger = logging.getLogger(__name__) @@ -363,23 +364,28 @@ def match( @staticmethod def _convert_request_output_to_completion_chunk( request_id: str, model: str, request_output: "RequestOutput" - ) -> CompletionChunk: + ) -> Tuple[CompletionChunk, Optional[str]]: choices: List[CompletionChoice] = [] + finish_reason = None for output in request_output.outputs: choices.append( CompletionChoice( text=output.text, index=output.index, logprobs=None, # TODO: support logprobs. 
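
The parse_messages helper above is what the code paths that still need the legacy triple (for example the yi_vl model earlier in this patch) use to recover (prompt, system_prompt, chat_history) from OpenAI-style messages. A small sketch of what it returns for a typical conversation:

# Sketch only.
from xinference.model.llm.utils import parse_messages

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "What is the capital of France?"},
]
prompt, system_prompt, chat_history = parse_messages(messages)
# prompt        == "What is the capital of France?"
# system_prompt == "You are a helpful assistant."
# chat_history  == the middle user/assistant turns, in order
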
- finish_reason=output.finish_reason, + finish_reason=None, ) ) - return CompletionChunk( - id=request_id, - object="text_completion", - created=int(time.time()), - model=model, - choices=choices, + finish_reason = output.finish_reason + return ( + CompletionChunk( + id=request_id, + object="text_completion", + created=int(time.time()), + model=model, + choices=choices, + ), + finish_reason, ) @staticmethod @@ -463,10 +469,14 @@ async def async_generate( async def stream_results() -> AsyncGenerator[CompletionChunk, None]: previous_texts = [""] * sanitized_generate_config["n"] - tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family) prompt_tokens, completion_tokens, total_tokens = 0, 0, 0 + complete_response = "" + match_tool_call_tmp_results = [] + is_match_tool_call = False + chunk = None + finish_reason = None async for _request_output in results_generator: - chunk = self._convert_request_output_to_completion_chunk( + chunk, finish_reason = self._convert_request_output_to_completion_chunk( request_id=request_id, model=self.model_uid, request_output=_request_output, @@ -476,40 +486,8 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: delta = choice["text"][len(previous_texts[i]) :] previous_texts[i] = choice["text"] choice["text"] = delta + complete_response += delta - if tools: - # only handle the first choice - choice = chunk["choices"][0] - if choice["finish_reason"] is not None: - # use previous text for evaluation temporarily - choice_delta = choice["text"] - choice["text"] = previous_texts[0] - _content, func, args = ChatModelMixin._eval_tool_arguments( - self.model_family, chunk, tools - ) - choice["text"] = tools_token_filter( - tokens=previous_texts[0], delta=choice_delta - ) - if func is not None: - choice["text"] = None - choice["finish_reason"] = "tool_calls" - choice["tool_calls"] = [ - ToolCalls( - id=str(uuid.uuid4()), - type="function", - function=ToolCallFunction( - name=func, - arguments=json.dumps(args, ensure_ascii=False), - ), - ) - ] - else: - # use a filter function to skip Qwen's react thought process - choice["text"] = tools_token_filter( - tokens=previous_texts[0], delta=choice["text"] - ) - if not choice["text"]: - continue prompt_tokens = len(_request_output.prompt_token_ids) completion_tokens = sum( len(output.token_ids) for output in _request_output.outputs @@ -520,7 +498,61 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: completion_tokens=completion_tokens, total_tokens=total_tokens, ) + + if tools: + """ + The qwen2 tool call returns format like this: + + {...} + + Here is to match this. 
+ """ + if (len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)) and ( + not QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response) + ): + for c in match_tool_call_tmp_results: + yield c + match_tool_call_tmp_results.clear() + yield chunk + elif (len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)) and ( + QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response) + ): + match_tool_call_tmp_results.append(chunk) + else: + assert len(QWEN_TOOL_CALL_SYMBOLS[0]) <= len(complete_response) + if not is_match_tool_call and complete_response.startswith( + QWEN_TOOL_CALL_SYMBOLS[0] + ): + is_match_tool_call = True + match_tool_call_tmp_results.clear() + + if not is_match_tool_call: + for c in match_tool_call_tmp_results: + yield c + match_tool_call_tmp_results.clear() + yield chunk + else: + chunk["choices"][0]["text"] = complete_response + else: + yield chunk + + if is_match_tool_call: + assert chunk is not None yield chunk + + # match OpenAI API stream + yield generate_completion_chunk( + chunk_text=None, + finish_reason=finish_reason, + chunk_id=request_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + if include_usage: chunk = CompletionChunk( id=request_id, @@ -586,59 +618,68 @@ def _sanitize_chat_config( ) -> Dict: if not generate_config: generate_config = {} - if self.model_family.prompt_style: - if ( - not generate_config.get("stop") - ) and self.model_family.prompt_style.stop: - generate_config["stop"] = self.model_family.prompt_style.stop.copy() - if self.model_family.prompt_style.stop_token_ids: - generate_config.setdefault( - "stop_token_ids", - self.model_family.prompt_style.stop_token_ids.copy(), - ) + if not generate_config.get("stop") and self.model_family.stop: + generate_config["stop"] = self.model_family.stop.copy() + if ( + not generate_config.get("stop_token_ids") + and self.model_family.stop_token_ids + ): + generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy() return generate_config + @staticmethod + def is_tool_call_chunk(chunk): + return chunk["choices"][0]["text"].startswith(QWEN_TOOL_CALL_SYMBOLS[0]) + + async def _async_to_tool_completion_chunks( + self, + chunks: AsyncGenerator[CompletionChunk, None], + ) -> AsyncGenerator[ChatCompletionChunk, None]: + i = 0 + async for chunk in chunks: + if i == 0: + yield self._get_first_chat_completion_chunk(chunk) + # usage + choices = chunk.get("choices") + if not choices: + yield self._get_final_chat_completion_chunk(chunk) + else: + if self.is_tool_call_chunk(chunk): + yield self._tool_calls_completion_chunk( + self.model_family, self.model_uid, chunk + ) + else: + yield self._to_chat_completion_chunk(chunk) + i += 1 + async def async_chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - if system_prompt: - prompt_style.system_prompt = system_prompt - chat_history = chat_history or [] tools = generate_config.pop("tools", []) if generate_config else None - full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools) - - generate_config = self._sanitize_chat_config(generate_config) - # TODO(codingl2k1): qwen hacky to set stop for function call. 
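
The branchy streaming logic above is a small prefix-buffering scheme: while the accumulated output could still be a prefix of the tool-call start marker, chunks are held back; once the ambiguity is resolved they are either flushed to the caller (ordinary text) or discarded and replaced by a single tool-call chunk at the end of the stream. A condensed sketch of the same idea; the marker value is an assumption standing in for QWEN_TOOL_CALL_SYMBOLS[0]:

# Sketch only.
def buffer_until_decided(deltas, marker="<tool_call>"):
    buffered, emitted, accumulated = [], [], ""
    is_tool_call = False
    for delta in deltas:
        accumulated += delta
        if len(marker) > len(accumulated):
            if marker.startswith(accumulated):
                buffered.append(delta)  # still ambiguous: hold back
            else:
                emitted.extend(buffered + [delta])  # definitely not a tool call
                buffered.clear()
        else:
            if not is_tool_call and accumulated.startswith(marker):
                is_tool_call = True  # swallow the buffered marker chunks
                buffered.clear()
            if not is_tool_call:
                emitted.extend(buffered + [delta])
                buffered.clear()
    return is_tool_call, emitted

buffer_until_decided(["<tool", "_call>", '{"name": "f", "arguments": {}}'])
# -> (True, [])
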
model_family = self.model_family.model_family or self.model_family.model_name + full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: - stop = generate_config.get("stop") - if isinstance(stop, str): - generate_config["stop"] = [stop, "Observation:"] - elif isinstance(stop, Iterable): - assert not isinstance(stop, str) - generate_config["stop"] = list(stop) + ["Observation:"] - else: - generate_config["stop"] = "Observation:" + full_context_kwargs["tools"] = tools + full_prompt = self.get_full_context( + messages, self.model_family.chat_template, **full_context_kwargs + ) + generate_config = self._sanitize_chat_config(generate_config) stream = generate_config.get("stream", None) if stream: agen = await self.async_generate(full_prompt, generate_config, tools) assert isinstance(agen, AsyncGenerator) + if tools: + return self._async_to_tool_completion_chunks(agen) return self._async_to_chat_completion_chunks(agen) else: c = await self.async_generate(full_prompt, generate_config) assert not isinstance(c, AsyncGenerator) if tools: - return self._tool_calls_completion( - self.model_family, self.model_uid, c, tools - ) + return self._tool_calls_completion(self.model_family, self.model_uid, c) return self._to_chat_completion(c) @@ -666,28 +707,28 @@ def _sanitize_chat_config( self, generate_config: Optional[Dict] = None, ) -> Dict: + from ..utils import get_stop_token_ids_from_config_file + if not generate_config: generate_config = {} - if self.model_family.prompt_style: - if self.model_family.prompt_style.stop_token_ids: + if generate_config.get("stop_token_ids", None) is None: + stop_token_ids = get_stop_token_ids_from_config_file(self.model_path) + if stop_token_ids is not None: + generate_config.setdefault("stop_token_ids", stop_token_ids) + else: generate_config.setdefault( - "stop_token_ids", - self.model_family.prompt_style.stop_token_ids.copy(), + "stop_token_ids", self.model_family.stop_token_ids.copy() ) return generate_config async def async_chat( self, - prompt: str, - system_prompt: Optional[str] = None, - chat_history: Optional[List[ChatCompletionMessage]] = None, + messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: # only support single image, waiting vllm support multi images - assert self.model_family.prompt_style is not None - prompt_style = self.model_family.prompt_style.copy() - chat_history = chat_history or [] - prompt, images = self.get_prompt(prompt, chat_history, prompt_style) + model_family = self.model_family.model_family or self.model_family.model_name + prompt, images = self.get_specific_prompt(model_family, messages) if len(images) == 0: inputs = { diff --git a/xinference/types.py b/xinference/types.py index 3f636d94c3..fee7c54948 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -39,8 +39,6 @@ top_p_field, ) -SPECIAL_TOOL_PROMPT = "" - class Image(TypedDict): url: Optional[str] @@ -142,7 +140,7 @@ class ToolCalls(TypedDict): class CompletionChoice(TypedDict): - text: str + text: NotRequired[str] index: int logprobs: Optional[CompletionLogprobs] finish_reason: Optional[str] From 7ff5951f20599bc49c72a6a2f807a57b33b1ed83 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 17:36:00 +0800 Subject: [PATCH 02/15] fix mypy --- xinference/core/supervisor.py | 2 +- xinference/model/image/stable_diffusion/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 1 + xinference/model/llm/mlx/core.py | 1 + xinference/model/llm/sglang/core.py | 1 + 
xinference/model/llm/transformers/core.py | 8 +++++++- xinference/model/llm/vllm/core.py | 8 +++++--- 7 files changed, 17 insertions(+), 6 deletions(-) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 61fc4caa8e..1a522333d8 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -1027,7 +1027,7 @@ async def _launch_model(): else: task = asyncio.create_task(_launch_model()) ASYNC_LAUNCH_TASKS[model_uid] = task - task.add_done_callback(lambda _: callback_for_async_launch(model_uid)) + task.add_done_callback(lambda _: callback_for_async_launch(model_uid)) # type: ignore return model_uid async def get_instance_info( diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index b00ee2de46..041774843e 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -198,7 +198,7 @@ def _gen_base64_image(_img): with ThreadPoolExecutor() as executor: results = list(map(partial(executor.submit, _gen_base64_image), images)) # type: ignore - image_list = [Image(url=None, b64_json=s.result()) for s in results] + image_list = [Image(url=None, b64_json=s.result()) for s in results] # type: ignore return ImageList(created=int(time.time()), data=image_list) else: raise ValueError(f"Unsupported response format: {response_format}") diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 30a835ff7c..28b8df2402 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -277,6 +277,7 @@ def chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, **full_context_kwargs ) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 07966fcbba..fd82c03798 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -379,6 +379,7 @@ def chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, **full_context_kwargs ) diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 7d2566ee27..8e36cd193f 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -442,6 +442,7 @@ async def async_chat( messages: List[Dict], generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]: + assert self.model_family.chat_template is not None full_prompt = self.get_full_context(messages, self.model_family.chat_template) generate_config = self._sanitize_chat_config(generate_config) diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index fd7d75b22e..a6e5a14d0f 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -618,7 +618,11 @@ def get_builtin_stop_token_ids(self) -> Tuple: if stop_token_ids is not None: return tuple(stop_token_ids) else: - return tuple(self.model_family.stop_token_ids) + return ( + tuple(self.model_family.stop_token_ids) + if self.model_family.stop_token_ids + else tuple() + ) def 
handle_batch_inference_results(self, req_list: List[InferenceRequest]): for req in req_list: @@ -724,6 +728,7 @@ def chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, @@ -749,6 +754,7 @@ def load(self): super().load() def _get_full_prompt(self, messages: List[Dict], tools): + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, tokenizer=self._tokenizer ) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index e97df3b8a1..3b4d77f293 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -662,6 +662,7 @@ async def async_chat( full_context_kwargs = {} if tools and model_family in QWEN_TOOL_CALL_FAMILY: full_context_kwargs["tools"] = tools + assert self.model_family.chat_template is not None full_prompt = self.get_full_context( messages, self.model_family.chat_template, **full_context_kwargs ) @@ -716,9 +717,10 @@ def _sanitize_chat_config( if stop_token_ids is not None: generate_config.setdefault("stop_token_ids", stop_token_ids) else: - generate_config.setdefault( - "stop_token_ids", self.model_family.stop_token_ids.copy() - ) + if self.model_family.stop_token_ids: + generate_config.setdefault( + "stop_token_ids", self.model_family.stop_token_ids.copy() + ) return generate_config async def async_chat( From 3c43cffcdfadc0a44113981643417adfe62e9784 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 18:47:25 +0800 Subject: [PATCH 03/15] fix UT --- xinference/client/tests/test_client.py | 69 +++++++++++++++------- xinference/model/llm/llama_cpp/core.py | 7 +-- xinference/model/llm/mlx/core.py | 6 +- xinference/model/llm/transformers/utils.py | 6 +- xinference/model/llm/utils.py | 6 +- 5 files changed, 59 insertions(+), 35 deletions(-) diff --git a/xinference/client/tests/test_client.py b/xinference/client/tests/test_client.py index e6bd554129..fd785b034c 100644 --- a/xinference/client/tests/test_client.py +++ b/xinference/client/tests/test_client.py @@ -83,9 +83,14 @@ def _check_stream(): generate_config={"stream": True, "max_tokens": 5}, ) for chunk in streaming_response: - assert ("content" in chunk["choices"][0]["delta"]) or ( - "role" in chunk["choices"][0]["delta"] - ) + assert "finish_reason" in chunk["choices"][0] + finish_reason = chunk["choices"][0]["finish_reason"] + if finish_reason is None: + assert ("content" in chunk["choices"][0]["delta"]) or ( + "role" in chunk["choices"][0]["delta"] + ) + else: + assert chunk["choices"][0]["delta"] == {} _check_stream() @@ -218,7 +223,6 @@ def test_RESTful_client_custom_model(setup): "en", "zh" ], "model_ability": [ - "embed", "chat" ], "model_family": "other", @@ -234,15 +238,9 @@ def test_RESTful_client_custom_model(setup): "model_id": "ziqingyang/chinese-alpaca-2-7b" } ], - "prompt_style": { - "style_name": "ADD_COLON_SINGLE", - "system_prompt": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.", - "roles": [ - "Instruction", - "Response" - ], - "intra_message_sep": "\\n\\n### " - } + "chat_template": "xyz", + "stop_token_ids": [], + "stop": [] }""" client.register_model(model_type="LLM", model=model, persist=False) @@ -266,7 +264,7 @@ def test_RESTful_client_custom_model(setup): custom_model_reg = model_reg assert custom_model_reg is None - # test register with string prompt style name + # test register with chat_template using model_family model_with_prompt = """{ "version": 1, "context_length":2048, @@ -291,12 +289,12 @@ def test_RESTful_client_custom_model(setup): "model_id": "ziqingyang/chinese-alpaca-2-7b" } ], - "prompt_style": "qwen-chat" + "chat_template": "qwen-chat" }""" client.register_model(model_type="LLM", model=model_with_prompt, persist=False) client.unregister_model(model_type="LLM", model_name="custom_model") - model_with_prompt2 = """{ + model_with_vision = """{ "version": 1, "context_length":2048, "model_name": "custom_model", @@ -304,8 +302,8 @@ def test_RESTful_client_custom_model(setup): "en", "zh" ], "model_ability": [ - "embed", - "chat" + "chat", + "vision" ], "model_family": "other", "model_specs": [ @@ -320,10 +318,41 @@ def test_RESTful_client_custom_model(setup): "model_id": "ziqingyang/chinese-alpaca-2-7b" } ], - "prompt_style": "xyz123" + "chat_template": "xyz123" }""" with pytest.raises(RuntimeError): - client.register_model(model_type="LLM", model=model_with_prompt2, persist=False) + client.register_model(model_type="LLM", model=model_with_vision, persist=False) + + model_with_tool_call = """{ + "version": 1, + "context_length":2048, + "model_name": "custom_model", + "model_lang": [ + "en", "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_family": "other", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "ziqingyang/chinese-alpaca-2-7b" + } + ], + "chat_template": "xyz123" + }""" + with pytest.raises(RuntimeError): + client.register_model( + model_type="LLM", model=model_with_tool_call, persist=False + ) def test_client_from_modelscope(setup): diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 28b8df2402..8e4929cbfe 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -182,13 +182,10 @@ def generator_wrapper( ): _completion_chunk["model"] = self.model_uid request_id = _completion_chunk["id"] - choice = _completion_chunk["choices"][0] - if choice["finish_reason"] is not None: - completion_tokens = index - choice.pop("text", None) + completion_tokens = index + 1 total_tokens = prompt_tokens + completion_tokens _completion_chunk["usage"] = CompletionUsage( - prompt_tokens=total_tokens, + prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, ) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index fd82c03798..23bb19ed11 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -245,15 +245,13 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): ) if stream: yield generate_completion_chunk( - None, + "", finish_reason=finish_reason, chunk_id=chunk_id, model_uid=model_uid, prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), - has_choice=True, - has_content=False, ), completion_usage else: yield generate_completion_chunk( @@ -264,8 
+262,6 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), - has_choice=True, - has_content=True, ), completion_usage if include_usage: diff --git a/xinference/model/llm/transformers/utils.py b/xinference/model/llm/transformers/utils.py index d34112d24f..ed07f1b65e 100644 --- a/xinference/model/llm/transformers/utils.py +++ b/xinference/model/llm/transformers/utils.py @@ -321,7 +321,7 @@ def generate_stream( if stream: completion_choice = CompletionChoice( - index=0, logprobs=None, finish_reason=finish_reason + text=output, index=0, logprobs=None, finish_reason=finish_reason ) else: completion_choice = CompletionChoice( @@ -692,15 +692,13 @@ def _batch_inference_one_step_internal( if r.stopped: # OpenAI compatible chunk completion_chunk = generate_completion_chunk( - chunk_text=None, + chunk_text="", finish_reason=r.finish_reason, chunk_id=r.chunk_id, model_uid=model_uid, prompt_tokens=len(r.prompt_tokens), completion_tokens=len(r.new_tokens), total_tokens=len(r.prompt_tokens) + len(r.new_tokens), - has_choice=True, - has_content=False, ) r.completion.append(completion_chunk) r.completion.append(eos_flag) diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 974671720e..5b9c5fc70e 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -191,7 +191,11 @@ def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChun { "index": i, "delta": { - **({"content": choice["text"]} if "text" in choice else {}), + **( + {"content": choice["text"]} + if ("text" in choice and choice["finish_reason"] is None) + else {} + ), **( {"tool_calls": choice["tool_calls"]} if "tool_calls" in choice From dfe1dffb33e5e7e80663190feaf880e2cb807823 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 29 Aug 2024 18:56:38 +0800 Subject: [PATCH 04/15] fix --- xinference/model/llm/mlx/core.py | 19 +++++-------------- xinference/model/llm/sglang/core.py | 4 +--- xinference/model/llm/vllm/core.py | 4 +--- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 23bb19ed11..7553cd64b4 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -24,7 +24,6 @@ ChatCompletion, ChatCompletionChunk, Completion, - CompletionChoice, CompletionChunk, CompletionUsage, LoRA, @@ -211,24 +210,16 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): else: output += out - completion_choice = CompletionChoice( - text=output, index=0, logprobs=None, finish_reason=None - ) - completion_chunk = CompletionChunk( - id=chunk_id, - object="text_completion", - created=int(time.time()), - model=model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( + yield generate_completion_chunk( + chunk_text=output, + finish_reason=None, + chunk_id=chunk_id, + model_uid=model_uid, prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), ) - yield completion_chunk, completion_usage - logger.info( f"Average generation speed: {i / (time.time() - start):.2f} tokens/s." 
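
With the _to_chat_completion_chunk change above, a streamed chat delta carries "content" only while finish_reason is None; the terminating chunk arrives with an empty delta, and a trailing usage-only chunk (empty choices) may follow when usage reporting is enabled. A client-side sketch that mirrors the updated test_client check; the model argument stands for a chat model handle obtained from the xinference client:

# Sketch only.
def stream_chat_reply(model, messages):
    reply = ""
    for chunk in model.chat(messages, generate_config={"stream": True, "max_tokens": 64}):
        choices = chunk.get("choices")
        if not choices:  # usage-only chunk, if include_usage was requested
            continue
        choice = choices[0]
        if choice["finish_reason"] is None:
            reply += choice["delta"].get("content", "")
        else:
            assert choice["delta"] == {}
    return reply
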
) diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 8e36cd193f..b2b830d23c 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -373,15 +373,13 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: else finish_reason ) yield generate_completion_chunk( - None, + "", finish_reason=finish_reason, chunk_id=request_id, model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, - has_choice=True, - has_content=False, ) if include_usage: diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 3b4d77f293..3a142c7de7 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -542,15 +542,13 @@ async def stream_results() -> AsyncGenerator[CompletionChunk, None]: # match OpenAI API stream yield generate_completion_chunk( - chunk_text=None, + chunk_text="", finish_reason=finish_reason, chunk_id=request_id, model_uid=self.model_uid, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, - has_choice=True, - has_content=False, ) if include_usage: From ecdb930f95908b621d3632f77df471ec8c7624fc Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 11:03:26 +0800 Subject: [PATCH 05/15] fix mlx UT --- xinference/model/llm/mlx/core.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 7553cd64b4..d01324fbf5 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -210,6 +210,12 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): else: output += out + completion_usage = CompletionUsage( + prompt_tokens=input_echo_len, + completion_tokens=i, + total_tokens=(input_echo_len + i), + ) + yield generate_completion_chunk( chunk_text=output, finish_reason=None, @@ -218,7 +224,7 @@ def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig): prompt_tokens=input_echo_len, completion_tokens=i, total_tokens=(input_echo_len + i), - ) + ), completion_usage logger.info( f"Average generation speed: {i / (time.time() - start):.2f} tokens/s." 
From 0b01c31d7fea925f4ecd8be9c416351e87d2ee5f Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 11:16:45 +0800 Subject: [PATCH 06/15] fix UT --- xinference/client/tests/test_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/client/tests/test_client.py b/xinference/client/tests/test_client.py index fd785b034c..62f91a87ec 100644 --- a/xinference/client/tests/test_client.py +++ b/xinference/client/tests/test_client.py @@ -73,7 +73,7 @@ def test_RESTful_client(setup): with pytest.raises(RuntimeError): completion = model.chat({"max_tokens": 64}) - messages = {"role": "user", "content": "What is the capital of France?"} + messages = [{"role": "user", "content": "What is the capital of France?"}] completion = model.chat(messages) assert "content" in completion["choices"][0]["message"] From a8adee41f58ab5019de81dfc2f1dc35ff83062a6 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 15:20:05 +0800 Subject: [PATCH 07/15] fix continous batching streaming format issue when chat model calls generate interface --- xinference/core/model.py | 12 ++++++++---- xinference/core/scheduler.py | 22 ++++++++++++++-------- xinference/model/llm/transformers/core.py | 14 ++++++++++++++ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/xinference/core/model.py b/xinference/core/model.py index 10ab759fe6..cec574ab6d 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -439,7 +439,9 @@ async def _call_wrapper(self, output_type: str, fn: Callable, *args, **kwargs): @log_async(logger=logger) async def generate(self, prompt: str, *args, **kwargs): if self.allow_batching(): - return await self.handle_batching_request(prompt, *args, **kwargs) + return await self.handle_batching_request( + prompt, "generate", *args, **kwargs + ) else: kwargs.pop("raw_params", None) if hasattr(self._model, "generate"): @@ -484,7 +486,7 @@ def _get_stream_from_args(*args) -> bool: return False if args[0] is None else args[0].get("stream", False) async def handle_batching_request( - self, prompt_or_messages: Union[str, List[Dict]], *args, **kwargs + self, prompt_or_messages: Union[str, List[Dict]], call_ability, *args, **kwargs ): """ The input parameter `prompt_or_messages`: @@ -498,7 +500,7 @@ async def handle_batching_request( queue: Queue[Any] = Queue() ret = self._queue_consumer(queue) await self._scheduler_ref.add_request( - prompt_or_messages, queue, *args, **kwargs + prompt_or_messages, queue, call_ability, *args, **kwargs ) gen = self._to_async_gen("json", ret) self._current_generator = weakref.ref(gen) @@ -527,7 +529,9 @@ async def chat(self, messages: List[Dict], *args, **kwargs): response = None try: if self.allow_batching(): - return await self.handle_batching_request(messages, *args, **kwargs) + return await self.handle_batching_request( + messages, "chat", *args, **kwargs + ) else: kwargs.pop("raw_params", None) if hasattr(self._model, "chat"): diff --git a/xinference/core/scheduler.py b/xinference/core/scheduler.py index 842b8bd737..6f4af5bfc9 100644 --- a/xinference/core/scheduler.py +++ b/xinference/core/scheduler.py @@ -38,7 +38,13 @@ class AbortRequestMessage(Enum): class InferenceRequest: def __init__( - self, prompt_or_messages, future_or_queue, is_prefill, *args, **kwargs + self, + prompt_or_messages, + future_or_queue, + is_prefill, + call_ability, + *args, + **kwargs, ): # original prompt, prompt(str) for generate model and messages(List[Dict]) for chat model self._prompt = prompt_or_messages @@ -46,6 +52,9 @@ def __init__( 
self._full_prompt = None # whether the current request is in the prefill phase self._is_prefill = is_prefill + # the ability that the user calls this model for, that is `generate` / `chat` for now, + # which is for results formatting + self._call_ability = call_ability # full prompt tokens self._prompt_tokens = None # all new generated tokens during decode phase @@ -104,12 +113,8 @@ def prompt(self): return self._prompt @property - def system_prompt(self): - return self._inference_args[0] - - @property - def chat_history(self): - return self._inference_args[1] + def call_ability(self): + return self._call_ability @property def full_prompt(self): @@ -413,11 +418,12 @@ async def add_request( self, prompt_or_messages: Union[str, List[Dict]], future_or_queue, + call_ability, *args, **kwargs, ): req = InferenceRequest( - prompt_or_messages, future_or_queue, True, *args, **kwargs + prompt_or_messages, future_or_queue, True, call_ability, *args, **kwargs ) rid = req.request_id if rid is not None: diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index a6e5a14d0f..05feced99d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -774,6 +774,20 @@ def prepare_batch_inference(self, req_list: List[InferenceRequest]): def handle_batch_inference_results(self, req_list: List[InferenceRequest]): for req in req_list: if req.error_msg is None and req.completion: + # The `generate` function can be called for some chat models. + # So that we cannot convert completion chunk to chat completion chunk. + if req.call_ability == "generate": + results = [] + for c in req.completion: + if c == "": + continue + elif c == "": + break + else: + results.append(c) + req.completion = results + continue + if req.stream: results = [] for i, c in enumerate(req.completion): From 155715d4491c19bc1730ca8358dd0782e9fbdb3b Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 30 Aug 2024 16:05:53 +0800 Subject: [PATCH 08/15] fix UT --- xinference/core/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/core/model.py b/xinference/core/model.py index cec574ab6d..c8e73c3060 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -511,7 +511,7 @@ async def handle_batching_request( assert self._loop is not None future = ConcurrentFuture() await self._scheduler_ref.add_request( - prompt_or_messages, future, *args, **kwargs + prompt_or_messages, future, call_ability, *args, **kwargs ) fut = asyncio.wrap_future(future, loop=self._loop) result = await fut From 86a32bb34d971b1c7b57d3496c6214b0abf164c3 Mon Sep 17 00:00:00 2001 From: yiboyasss <3359595624@qq.com> Date: Wed, 4 Sep 2024 16:45:33 +0800 Subject: [PATCH 09/15] ui: register page --- xinference/web/ui/package-lock.json | 38 ++ xinference/web/ui/package.json | 1 + .../register_model/components/addStop.js | 107 ++++ .../web/ui/src/scenes/register_model/index.js | 3 +- .../scenes/register_model/registerModel.js | 489 ++++++++++-------- .../styles/registerModelStyle.css | 23 + 6 files changed, 452 insertions(+), 209 deletions(-) create mode 100644 xinference/web/ui/src/scenes/register_model/components/addStop.js diff --git a/xinference/web/ui/package-lock.json b/xinference/web/ui/package-lock.json index 0730d3b275..7f15648e74 100644 --- a/xinference/web/ui/package-lock.json +++ b/xinference/web/ui/package-lock.json @@ -29,6 +29,7 @@ "@testing-library/user-event": "^13.5.0", "clipboard": "^2.0.11", "formik": "^2.4.2", + "nunjucks": 
"^3.2.4", "prop-types": "^15.8.1", "react": "^18.2.0", "react-cookie": "^6.1.1", @@ -5799,6 +5800,11 @@ "resolved": "https://registry.npmjs.org/@xtuc/long/-/long-4.2.2.tgz", "integrity": "sha512-NuHqBY1PB/D8xU6s/thBgOAiAP7HOYDQ32+BFZILJ8ivkUkAHQnWfn6WhL79Owj1qmUnoN/YPhktdIoucipkAQ==" }, + "node_modules/a-sync-waterfall": { + "version": "1.0.1", + "resolved": "https://registry.npmmirror.com/a-sync-waterfall/-/a-sync-waterfall-1.0.1.tgz", + "integrity": "sha512-RYTOHHdWipFUliRFMCS4X2Yn2X8M87V/OpSqWzKKOGhzqyUxzyVmhHDH9sAvG+ZuQf/TAOFsLCpMw09I1ufUnA==" + }, "node_modules/abab": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/abab/-/abab-2.0.6.tgz", @@ -13750,6 +13756,38 @@ "url": "https://github.com/fb55/nth-check?sponsor=1" } }, + "node_modules/nunjucks": { + "version": "3.2.4", + "resolved": "https://registry.npmmirror.com/nunjucks/-/nunjucks-3.2.4.tgz", + "integrity": "sha512-26XRV6BhkgK0VOxfbU5cQI+ICFUtMLixv1noZn1tGU38kQH5A5nmmbk/O45xdyBhD1esk47nKrY0mvQpZIhRjQ==", + "dependencies": { + "a-sync-waterfall": "^1.0.0", + "asap": "^2.0.3", + "commander": "^5.1.0" + }, + "bin": { + "nunjucks-precompile": "bin/precompile" + }, + "engines": { + "node": ">= 6.9.0" + }, + "peerDependencies": { + "chokidar": "^3.3.0" + }, + "peerDependenciesMeta": { + "chokidar": { + "optional": true + } + } + }, + "node_modules/nunjucks/node_modules/commander": { + "version": "5.1.0", + "resolved": "https://registry.npmmirror.com/commander/-/commander-5.1.0.tgz", + "integrity": "sha512-P0CysNDQ7rtVw4QIQtm+MRxV66vKFSvlsQvGYXZWR3qFU0jlMKHZZZgw8e+8DSah4UDKMqnknRDQz+xuQXQ/Zg==", + "engines": { + "node": ">= 6" + } + }, "node_modules/nwsapi": { "version": "2.2.7", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.7.tgz", diff --git a/xinference/web/ui/package.json b/xinference/web/ui/package.json index 0a163ec52b..1bda015ba8 100644 --- a/xinference/web/ui/package.json +++ b/xinference/web/ui/package.json @@ -25,6 +25,7 @@ "@testing-library/user-event": "^13.5.0", "clipboard": "^2.0.11", "formik": "^2.4.2", + "nunjucks": "^3.2.4", "prop-types": "^15.8.1", "react": "^18.2.0", "react-cookie": "^6.1.1", diff --git a/xinference/web/ui/src/scenes/register_model/components/addStop.js b/xinference/web/ui/src/scenes/register_model/components/addStop.js new file mode 100644 index 0000000000..0acca09981 --- /dev/null +++ b/xinference/web/ui/src/scenes/register_model/components/addStop.js @@ -0,0 +1,107 @@ +import AddIcon from '@mui/icons-material/Add' +import DeleteIcon from '@mui/icons-material/Delete' +import { Alert, Button, TextField } from '@mui/material' +import React, { useEffect, useState } from 'react' + +const regex = /^[1-9]\d*$/ + +const AddStop = ({ label, onGetData, arrItemType, formData, onGetError }) => { + const [dataArr, setDataArr] = useState(formData?.length ? 
formData : ['']) + const arr = [] + + useEffect(() => { + if (arrItemType === 'number') { + const newDataArr = dataArr.map((item) => { + if (item && regex.test(item)) { + arr.push('true') + return Number(item) + } + if (item && !regex.test(item)) arr.push('false') + return item + }) + onGetError(arr) + onGetData(newDataArr) + } else { + onGetData(dataArr) + } + }, [dataArr]) + + const handleChange = (value, index) => { + const arr = [...dataArr] + arr[index] = value + setDataArr([...arr]) + } + + const handleAdd = () => { + if (dataArr[dataArr.length - 1]) { + setDataArr([...dataArr, '']) + } + } + + const handleDelete = (index) => { + setDataArr(dataArr.filter((_, subIndex) => index !== subIndex)) + } + + const handleShowAlert = (item) => { + return item !== '' && !regex.test(item) && arrItemType === 'number' + } + + return ( + <> +
+
+ + +
+
+ {dataArr.map((item, index) => ( +
+
+ handleChange(e.target.value, index)} + size="small" + style={{ width: '100%' }} + /> + {dataArr.length > 1 && ( + handleDelete(index)} + style={{ cursor: 'pointer', color: '#1976d2' }} + /> + )} +
+ + {handleShowAlert(item) && ( + + Please enter an integer greater than 0. + + )} +
+ ))} +
+
+ + ) +} + +export default AddStop diff --git a/xinference/web/ui/src/scenes/register_model/index.js b/xinference/web/ui/src/scenes/register_model/index.js index eb5b0a9e77..6aa0146bc9 100644 --- a/xinference/web/ui/src/scenes/register_model/index.js +++ b/xinference/web/ui/src/scenes/register_model/index.js @@ -63,7 +63,6 @@ const RegisterModel = () => { context_length: 2048, model_lang: ['en'], model_ability: ['generate'], - model_family: '', model_specs: [ { model_uri: '/path/to/llama-1', @@ -72,7 +71,7 @@ const RegisterModel = () => { quantizations: ['none'], }, ], - prompt_style: undefined, + model_family: '', }} /> diff --git a/xinference/web/ui/src/scenes/register_model/registerModel.js b/xinference/web/ui/src/scenes/register_model/registerModel.js index 06cc582927..717587d6d6 100644 --- a/xinference/web/ui/src/scenes/register_model/registerModel.js +++ b/xinference/web/ui/src/scenes/register_model/registerModel.js @@ -1,14 +1,20 @@ import './styles/registerModelStyle.css' -import CheckIcon from '@mui/icons-material/Check' +import Cancel from '@mui/icons-material/Cancel' +import CheckCircleIcon from '@mui/icons-material/CheckCircle' import KeyboardDoubleArrowRightIcon from '@mui/icons-material/KeyboardDoubleArrowRight' import NotesIcon from '@mui/icons-material/Notes' +import OpenInFullIcon from '@mui/icons-material/OpenInFull' import { Alert, Box, Button, Checkbox, Chip, + Dialog, + DialogActions, + DialogContent, + DialogTitle, FormControl, FormControlLabel, InputLabel, @@ -21,6 +27,7 @@ import { TextField, Tooltip, } from '@mui/material' +import nunjucks from 'nunjucks' import React, { useContext, useEffect, useRef, useState } from 'react' import { useCookies } from 'react-cookie' import { useNavigate, useParams } from 'react-router-dom' @@ -31,22 +38,27 @@ import fetchWrapper from '../../components/fetchWrapper' import { isValidBearerToken } from '../../components/utils' import AddControlnet from './components/addControlnet' import AddModelSpecs from './components/addModelSpecs' +import AddStop from './components/addStop' import languages from './data/languages' const SUPPORTED_LANGUAGES_DICT = { en: 'English', zh: 'Chinese' } const SUPPORTED_FEATURES = ['Generate', 'Chat', 'Vision'] +const messages = [ + { + role: 'assistant', + content: 'This is the message content replied by the assistant previously', + }, + { + role: 'user', + content: 'This is the message content sent by the user currently', + }, +] // Convert dictionary of supported languages into list const SUPPORTED_LANGUAGES = Object.keys(SUPPORTED_LANGUAGES_DICT) const RegisterModelComponent = ({ modelType, customData }) => { - const endPoint = useContext(ApiContext).endPoint const { setErrorMsg } = useContext(ApiContext) const [formData, setFormData] = useState(customData) - const [promptStyles, setPromptStyles] = useState([]) - const [family, setFamily] = useState({ - chat: [], - generate: [], - }) const [languagesArr, setLanguagesArr] = useState([]) const [isContextLengthAlert, setIsContextLengthAlert] = useState(false) const [isDimensionsAlert, setIsDimensionsAlert] = useState(false) @@ -73,6 +85,11 @@ const RegisterModelComponent = ({ modelType, customData }) => { ) const [contrastObj, setContrastObj] = useState({}) const [isEqual, setIsEqual] = useState(true) + const [testRes, setTestRes] = useState('') + const [isOpenMessages, setIsOpenMessages] = useState(false) + const [testErrorInfo, setTestErrorInfo] = useState('') + const [isTestSuccess, setIsTestSuccess] = useState(false) + const [isStopTokenIdsAlert, 
setIsStopTokenIdsAlert] = useState(false) useEffect(() => { if (model_name) { @@ -93,7 +110,9 @@ const RegisterModelComponent = ({ modelType, customData }) => { model_ability, model_family, model_specs, - prompt_style, + chat_template, + stop_token_ids, + stop, } = data const specsDataArr = model_specs.map((item) => { const { @@ -120,8 +139,10 @@ const RegisterModelComponent = ({ modelType, customData }) => { model_ability, model_family, model_specs: specsDataArr, + chat_template, + stop_token_ids, + stop, } - prompt_style ? (llmData.prompt_style = prompt_style) : '' setFormData(llmData) setContrastObj(llmData) setSpecsArr(specsDataArr) @@ -217,79 +238,6 @@ const RegisterModelComponent = ({ modelType, customData }) => { navigate('/login', { replace: true }) return } - - const getBuiltinFamilies = async () => { - const response = await fetch(endPoint + '/v1/models/families', { - method: 'GET', - headers: { - 'Content-Type': 'application/json', - }, - }) - if (!response.ok) { - const errorData = await response.json() // Assuming the server returns error details in JSON format - setErrorMsg( - `Server error: ${response.status} - ${ - errorData.detail || 'Unknown error' - }` - ) - } else { - const data = await response.json() - data.chat.push('other') - data.generate.push('other') - setFamily(data) - } - } - - const getBuiltInPromptStyles = async () => { - const response = await fetch(endPoint + '/v1/models/prompts', { - method: 'GET', - headers: { - 'Content-Type': 'application/json', - }, - }) - if (!response.ok) { - const errorData = await response.json() // Assuming the server returns error details in JSON format - setErrorMsg( - `Server error: ${response.status} - ${ - errorData.detail || 'Unknown error' - }` - ) - } else { - const data = await response.json() - let res = [] - for (const key in data) { - let v = data[key] - v['name'] = key - res.push(v) - } - setPromptStyles(res) - } - } - - if ( - Object.prototype.hasOwnProperty.call(customData, 'model_ability') && - Object.prototype.hasOwnProperty.call(customData, 'model_family') - ) { - // avoid keep requesting backend to get prompts - if (promptStyles.length === 0) { - getBuiltInPromptStyles().catch((error) => { - setErrorMsg( - error.message || - 'An unexpected error occurred when getting builtin prompt styles.' - ) - console.error('Error: ', error) - }) - } - if (family.chat.length === 0) { - getBuiltinFamilies().catch((error) => { - setErrorMsg( - error.message || - 'An unexpected error occurred when getting builtin prompt styles.' 
- ) - console.error('Error: ', error) - }) - } - } }, [cookie.token]) useEffect(() => { @@ -299,34 +247,7 @@ const RegisterModelComponent = ({ modelType, customData }) => { } }, [formData]) - const getFamilyByAbility = () => { - if ( - formData.model_ability.includes('chat') || - formData.model_ability.includes('vision') - ) { - return family.chat - } else { - return family.generate - } - } - - const sortStringsByFirstLetter = (arr) => { - return arr.sort((a, b) => { - const firstCharA = a.charAt(0).toLowerCase() - const firstCharB = b.charAt(0).toLowerCase() - if (firstCharA < firstCharB) { - return -1 - } - if (firstCharA > firstCharB) { - return 1 - } - return 0 - }) - } - const handleClick = async () => { - console.log('formData', modelType, formData) - for (let key in formData) { const type = Object.prototype.toString.call(formData[key]).slice(8, -1) if ( @@ -427,61 +348,26 @@ const RegisterModelComponent = ({ modelType, customData }) => { } const toggleAbility = (ability) => { + const obj = JSON.parse(JSON.stringify(formData)) if (formData.model_ability.includes(ability)) { - const obj = JSON.parse(JSON.stringify(formData)) if (ability === 'chat') { - delete obj.prompt_style + delete obj.chat_template + delete obj.stop_token_ids + delete obj.stop } setFormData({ ...obj, model_ability: formData.model_ability.filter((a) => a !== ability), - model_family: '', }) } else { - setFormData({ - ...formData, - model_ability: [...formData.model_ability, ability], - model_family: '', - }) - } - } - - const toggleFamily = (value) => { - const ps = promptStyles.find((item) => item.name === value) - if (formData.model_ability.includes('chat') && ps) { - const prompt_style = { - style_name: ps.style_name, - system_prompt: ps.system_prompt, - roles: ps.roles, - intra_message_sep: ps.intra_message_sep, - inter_message_sep: ps.inter_message_sep, - stop: ps.stop ?? null, - stop_token_ids: ps.stop_token_ids ?? null, + if (ability === 'chat') { + obj.chat_template = '' + obj.stop_token_ids = [] + obj.stop = [] } setFormData({ - ...formData, - model_family: value, - prompt_style, - }) - } else { - const { - version, - model_name, - model_description, - context_length, - model_lang, - model_ability, - model_specs, - } = formData - setFormData({ - version, - model_name, - model_description, - context_length, - model_lang, - model_ability, - model_family: value, - model_specs, + ...obj, + model_ability: [...formData.model_ability, ability], }) } } @@ -569,6 +455,58 @@ const RegisterModelComponent = ({ modelType, customData }) => { return true } + const handleTest = () => { + setTestRes('') + if (formData.chat_template) { + try { + nunjucks.configure({ autoescape: false }) + const test_res = nunjucks.renderString(formData.chat_template, { + messages: messages, + }) + if (test_res === '') { + setTestRes(test_res) + setTestErrorInfo('error') + setIsTestSuccess(false) + } else { + setTestRes(test_res) + setTestErrorInfo('') + setIsTestSuccess(true) + } + } catch (error) { + setTestErrorInfo(`${error}`) + setIsTestSuccess(false) + } + } + } + + const getStopTokenIds = (value) => { + if (value.length === 1 && value[0] === '') { + setFormData({ + ...formData, + stop_token_ids: [], + }) + } else { + setFormData({ + ...formData, + stop_token_ids: value, + }) + } + } + + const getStop = (value) => { + if (value.length === 1 && value[0] === '') { + setFormData({ + ...formData, + stop: [], + }) + } else { + setFormData({ + ...formData, + stop: value, + }) + } + } + return (
@@ -845,66 +783,162 @@ const RegisterModelComponent = ({ modelType, customData }) => { {/* family */} {(customData.model_family === '' || customData.model_family) && ( - - - {modelType === 'LLM' && formData.model_family && ( - } - severity="success" - > - Please be careful to select the family name corresponding to - the model you want to register. If not found, please choose - other - . - - )} - {modelType === 'LLM' && !formData.model_family && ( - - Please be careful to select the family name corresponding to - the model you want to register. If not found, please choose - other - . - + <> + {modelType === 'LLM' && ( + <> + + setFormData({ + ...formData, + model_family: event.target.value, + }) + } + /> + + )} - { - toggleFamily(e.target.value) - }} - > - - {modelType === 'LLM' && - sortStringsByFirstLetter(getFamilyByAbility()).map((v) => ( - + {(modelType === 'image' || modelType === 'audio') && ( + <> + + + + } - label={v} + label={formData.model_family} /> - ))} - {(modelType === 'image' || modelType === 'audio') && ( - } - label={formData.model_family} + + + + + )} + + )} + + {/* chat_template */} + {formData.model_ability?.includes('chat') && ( + <> +
+ + setFormData({ + ...formData, + chat_template: event.target.value, + }) + } + style={{ flex: 1 }} + /> + +
+
+ messages example + setIsOpenMessages(true)} + style={{ fontSize: 14, color: '#666', cursor: 'pointer' }} /> - )} - - +
+
+ + test result + {testErrorInfo ? ( + + ) : testRes ? ( + + ) : ( + '' + )} + +
+ {testErrorInfo !== '' + ? testErrorInfo + : testRes + ? testRes + : 'No test results...'} +
+
+
+
+ + + )} + + {/* stop_token_ids */} + {formData.model_ability?.includes('chat') && ( + <> + { + if (value.includes('false')) { + setIsStopTokenIdsAlert(true) + } else { + setIsStopTokenIdsAlert(false) + } + }} + /> -
+ + )} + + {/* stop */} + {formData.model_ability?.includes('chat') && ( + <> + + + )} {/* specs */} @@ -1011,6 +1045,21 @@ const RegisterModelComponent = ({ modelType, customData }) => { color="primary" type="submit" onClick={handleClick} + disabled={ + isContextLengthAlert || + isDimensionsAlert || + isMaxTokensAlert || + formData.model_lang?.length === 0 || + formData.language?.length === 0 || + formData.model_ability?.length === 0 || + (modelType === 'LLM' && !formData.model_family) || + (formData.model_ability?.includes('chat') && + !formData.chat_template) || + (formData.model_ability?.includes('chat') && + formData.chat_template && + !isTestSuccess) || + isStopTokenIdsAlert + } > Register Model @@ -1018,6 +1067,32 @@ const RegisterModelComponent = ({ modelType, customData }) => { )}
+ setIsOpenMessages(false)} + aria-labelledby="alert-dialog-title" + aria-describedby="alert-dialog-description" + > + Messages Example + +
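The reworked registration form drops the old prompt_style and built-in family pickers in favor of a free-form chat_template plus stop and stop_token_ids fields, and it smoke-tests the template in the browser by rendering it with nunjucks against the fixed two-message example defined at the top of registerModel.js; the Register button stays disabled until that render succeeds. A rough server-side equivalent of the same check is sketched below with Jinja2 (which nunjucks mirrors); the helper name and the use of jinja2 here are illustrative assumptions, not code from this patch.

# Illustrative sketch, not project code: render a user-supplied chat template
# against the same example messages the UI uses and reject an empty result.
from jinja2 import BaseLoader, Environment

EXAMPLE_MESSAGES = [
    {
        "role": "assistant",
        "content": "This is the message content replied by the assistant previously",
    },
    {
        "role": "user",
        "content": "This is the message content sent by the user currently",
    },
]


def smoke_test_chat_template(chat_template: str) -> str:
    env = Environment(loader=BaseLoader(), autoescape=False)
    rendered = env.from_string(chat_template).render(messages=EXAMPLE_MESSAGES)
    if not rendered.strip():
        raise ValueError("chat_template rendered an empty prompt")
    return rendered


if __name__ == "__main__":
    # A hypothetical template, only to exercise the check above.
    template = (
        "{% for message in messages %}"
        "<|{{ message['role'] }}|>\n{{ message['content'] }}\n"
        "{% endfor %}<|assistant|>\n"
    )
    print(smoke_test_chat_template(template))

Raising on an empty render mirrors the UI behavior, where an empty nunjucks result is treated as a failed test and keeps isTestSuccess false.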