From 931aa6b0820da6499217678918400b3d6887cc68 Mon Sep 17 00:00:00 2001
From: Uranus <109661872+UranusSeven@users.noreply.github.com>
Date: Wed, 6 Sep 2023 22:16:44 +0800
Subject: [PATCH] FEAT: support baichuan2 (#425)

---
 README.md                                     |  2 +
 README_ja_JP.md                               |  2 +
 README_zh_CN.md                               |  2 +
 doc/source/models/builtin/baichuan-2-chat.rst | 51 ++++++++++
 doc/source/models/builtin/baichuan-2.rst      | 48 ++++++++++
 doc/source/models/builtin/index.rst           |  4 +
 xinference/model/llm/llm_family.json          | 92 +++++++++++++++++++
 xinference/model/llm/pytorch/baichuan.py      |  2 +-
 xinference/model/llm/pytorch/core.py          |  1 +
 9 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 doc/source/models/builtin/baichuan-2-chat.rst
 create mode 100644 doc/source/models/builtin/baichuan-2.rst

diff --git a/README.md b/README.md
index fd5e072468..1354acae0a 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,9 @@ $ xinference registrations
 | Type | Name                | Language     | Ability                |
 |------|---------------------|--------------|------------------------|
 | LLM  | baichuan            | ['en', 'zh'] | ['embed', 'generate']  |
+| LLM  | baichuan-2          | ['en', 'zh'] | ['embed', 'generate']  |
 | LLM  | baichuan-chat       | ['en', 'zh'] | ['embed', 'chat']      |
+| LLM  | baichuan-2-chat     | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm             | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm2            | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm2-32k        | ['en', 'zh'] | ['embed', 'chat']      |
diff --git a/README_ja_JP.md b/README_ja_JP.md
index bb530a0859..9838d8d23f 100644
--- a/README_ja_JP.md
+++ b/README_ja_JP.md
@@ -174,7 +174,9 @@ $ xinference registrations
 | Type | Name                | Language     | Ability                |
 |------|---------------------|--------------|------------------------|
 | LLM  | baichuan            | ['en', 'zh'] | ['embed', 'generate']  |
+| LLM  | baichuan-2          | ['en', 'zh'] | ['embed', 'generate']  |
 | LLM  | baichuan-chat       | ['en', 'zh'] | ['embed', 'chat']      |
+| LLM  | baichuan-2-chat     | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm             | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm2            | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm2-32k        | ['en', 'zh'] | ['embed', 'chat']      |
diff --git a/README_zh_CN.md b/README_zh_CN.md
index 7960d061ad..71e020e3fd 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -177,7 +177,9 @@ $ xinference registrations
 | Type | Name                | Language     | Ability                |
 |------|---------------------|--------------|------------------------|
 | LLM  | baichuan            | ['en', 'zh'] | ['embed', 'generate']  |
+| LLM  | baichuan-2          | ['en', 'zh'] | ['embed', 'generate']  |
 | LLM  | baichuan-chat       | ['en', 'zh'] | ['embed', 'chat']      |
+| LLM  | baichuan-2-chat     | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm             | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm2            | ['en', 'zh'] | ['embed', 'chat']      |
 | LLM  | chatglm2-32k        | ['en', 'zh'] | ['embed', 'chat']      |
diff --git a/doc/source/models/builtin/baichuan-2-chat.rst b/doc/source/models/builtin/baichuan-2-chat.rst
new file mode 100644
index 0000000000..0bbeb8affc
--- /dev/null
+++ b/doc/source/models/builtin/baichuan-2-chat.rst
@@ -0,0 +1,51 @@
+.. _models_builtin_baichuan_2_chat:
+
+===============
+Baichuan-2-Chat
+===============
+
+- **Context Length:** 4096
+- **Model Name:** baichuan-2-chat
+- **Languages:** en, zh
+- **Abilities:** embed, generate, chat
+- **Description:** Baichuan2-chat is a fine-tuned version of the Baichuan2 LLM, specializing in chatting.
+
+Specifications
+^^^^^^^^^^^^^^
+
+Model Spec 1 (pytorch, 7 Billion)
++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** baichuan-inc/Baichuan2-7B-Chat
+- **Model Revision:** 2ce891951e000c36c65442608a0b95fd09b405dc
+
+Execute the following command to launch the model. Remember to replace `${quantization}` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name baichuan-2-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+.. note::
+
+   Not supported on macOS.
+
+
+Model Spec 2 (pytorch, 13 Billion)
++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 13
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** baichuan-inc/Baichuan2-13B-Chat
+- **Model Revision:** a56c793eb7a721ab6c270f779024e0375e8afd4a
+
+Execute the following command to launch the model. Remember to replace `${quantization}` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name baichuan-2-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization}
+
+.. note::
+
+   Not supported on macOS.
diff --git a/doc/source/models/builtin/baichuan-2.rst b/doc/source/models/builtin/baichuan-2.rst
new file mode 100644
index 0000000000..bfb08a6949
--- /dev/null
+++ b/doc/source/models/builtin/baichuan-2.rst
@@ -0,0 +1,48 @@
+.. _models_builtin_baichuan_2:
+
+==========
+Baichuan-2
+==========
+
+- **Context Length:** 4096
+- **Model Name:** baichuan-2
+- **Languages:** en, zh
+- **Abilities:** embed, generate
+- **Description:** Baichuan2 is an open-source Transformer based LLM that is trained on both Chinese and English data.
+
+Specifications
+^^^^^^^^^^^^^^
+
+Model Spec 1 (pytorch, 7 Billion)
++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** baichuan-inc/Baichuan2-7B-Base
+
+Execute the following command to launch the model. Remember to replace `${quantization}` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name baichuan-2 --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+.. note::
+
+   Not supported on macOS.
+
+Model Spec 2 (pytorch, 13 Billion)
++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 13
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** baichuan-inc/Baichuan2-13B-Base
+
+Execute the following command to launch the model. Remember to replace `${quantization}` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name baichuan-2 --size-in-billions 13 --model-format pytorch --quantization ${quantization}
+
+.. note::
+
+   Not supported on macOS.
diff --git a/doc/source/models/builtin/index.rst b/doc/source/models/builtin/index.rst
index 4244b10f28..a002025037 100644
--- a/doc/source/models/builtin/index.rst
+++ b/doc/source/models/builtin/index.rst
@@ -11,6 +11,7 @@ Text Generation Models
 ++++++++++++++++++++++
 
 - :ref:`Baichuan <models_builtin_baichuan>`
+- :ref:`Baichuan-2 <models_builtin_baichuan_2>`
 - :ref:`Falcon <models_builtin_falcon>`
 - :ref:`InternLM <models_builtin_internlm>`
 - :ref:`Llama-2 <models_builtin_llama_2>`
@@ -21,6 +22,7 @@ Chat & Instruction-following Models
 +++++++++++++++++++++++++++++++++++
 
 - :ref:`Baichuan Chat <models_builtin_baichuan_chat>`
+- :ref:`Baichuan-2 Chat <models_builtin_baichuan_2_chat>`
 - :ref:`ChatGLM <models_builtin_chatglm>`
 - :ref:`ChatGLM2 <models_builtin_chatglm2>`
 - :ref:`ChatGLM2-32k <models_builtin_chatglm2_32k>`
@@ -57,7 +59,9 @@ Code Assistant Models
    :hidden:
 
    baichuan-chat
+   baichuan-2-chat
    baichuan
+   baichuan-2
    chatglm
    chatglm2-32k
    chatglm2
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 61f90fdac1..2095bfb09f 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -1376,6 +1376,7 @@
     "version": 1,
     "context_length": 100000,
     "model_name": "code-llama-instruct",
+    "model_description": "Code-Llama-Instruct is an instruct-tuned version of the Code-Llama LLM.",
     "model_lang": [
       "en"
     ],
@@ -1430,5 +1431,96 @@
         2
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "baichuan-2-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "embed",
+      "generate",
+      "chat"
+    ],
+    "model_description": "Baichuan2-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "baichuan-inc/Baichuan2-7B-Chat",
+        "model_revision": "2ce891951e000c36c65442608a0b95fd09b405dc"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "baichuan-inc/Baichuan2-13B-Chat",
+        "model_revision": "a56c793eb7a721ab6c270f779024e0375e8afd4a"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "NO_COLON_TWO",
+      "system_prompt": "",
+      "roles": [
+        "<reserved_106>",
+        "<reserved_107>"
+      ],
+      "intra_message_sep": "",
+      "inter_message_sep": "</s>",
+      "stop_token_ids": [
+        2,
+        195
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "baichuan-2",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "embed",
+      "generate"
+    ],
+    "model_description": "Baichuan2 is an open-source Transformer based LLM that is trained on both Chinese and English data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "baichuan-inc/Baichuan2-7B-Base",
+        "model_revision": "f2cc3a689c5eba7dc7fd3757d0175d312d167604"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "baichuan-inc/Baichuan2-13B-Base",
+        "model_revision": "fa88072fee36e36282287410e00897df2f59e09b"
+      }
+    ]
   }
 ]
diff --git a/xinference/model/llm/pytorch/baichuan.py b/xinference/model/llm/pytorch/baichuan.py
index 6484a1e19d..8d78f60b0c 100644
--- a/xinference/model/llm/pytorch/baichuan.py
+++ b/xinference/model/llm/pytorch/baichuan.py
@@ -69,7 +69,7 @@ def _load_model(self, kwargs: dict):
     def match(cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1") -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "baichuan-chat" not in llm_family.model_name:
+        if llm_family.model_name not in ["baichuan-chat", "baichuan-2-chat"]:
             return False
         if "chat" not in llm_family.model_ability:
             return False
diff --git a/xinference/model/llm/pytorch/core.py b/xinference/model/llm/pytorch/core.py
index 17be2d4530..668f8fa66d 100644
--- a/xinference/model/llm/pytorch/core.py
+++ b/xinference/model/llm/pytorch/core.py
@@ -438,6 +438,7 @@ def match(cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1") -> bool:
             return False
         if llm_family.model_name in [
             "baichuan-chat",
+            "baichuan-2-chat",
             "vicuna-v1.3",
             "falcon",
             "falcon-instruct",