FEAT: Support LMDeploy for internvl2 and fix missing finish reason in internvl stream (xorbitsai#2145)

Co-authored-by: wuzhaoxin <[email protected]>
amumu96 · Aug 23, 2024 · b500224 · b500224
1 parent 16d1193
commit b500224
Show file tree

Hide file tree

Showing 12 changed files with 629 additions and 23 deletions.
diff --git a/xinference/core/model.py b/xinference/core/model.py
@@ -177,6 +177,7 @@ def __init__(
         request_limits: Optional[int] = None,
     ):
         super().__init__()
+        from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
         from ..model.llm.transformers.core import PytorchModel
         from ..model.llm.vllm.core import VLLMModel
@@ -192,7 +193,9 @@ def __init__(
         self._current_generator = lambda: None
         self._lock = (
             None
-            if isinstance(self._model, (PytorchModel, VLLMModel, SGLANGModel))
+            if isinstance(
+                self._model, (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel)
+            )
             else asyncio.locks.Lock()
         )
         self._worker_ref = None

diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile
@@ -30,6 +30,8 @@ RUN pip install --upgrade -i "$PIP_INDEX" pip && \
     pip install "llama-cpp-python>=0.2.82" -i https://abetlen.github.io/llama-cpp-python/whl/cu124 && \
     pip install -i "$PIP_INDEX" --upgrade-strategy only-if-needed -r /opt/inference/xinference/deploy/docker/requirements.txt && \
     pip install -i "$PIP_INDEX" --no-deps sglang && \
+    pip uninstall flashinfer -y && \
+    pip install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4 && \
     cd /opt/inference && \
     python3 setup.py build_web && \
     git restore . && \

diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
@@ -34,6 +34,7 @@
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    LMDEPLOY_CLASSES,
     MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
@@ -113,6 +114,7 @@ def generate_engine_config_by_model_family(model_family):
 
 def _install():
     from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
     from .transformers.chatglm import ChatglmPytorchChatModel
@@ -148,6 +150,7 @@ def _install():
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel])
+    LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             ChatglmPytorchChatModel,
@@ -176,6 +179,7 @@ def _install():
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
     SUPPORTED_ENGINES["MLX"] = MLX_CLASSES
+    SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES
 
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
@@ -7189,15 +7189,6 @@
           "model_id": "OpenGVLab/InternVL2-4B",
           "model_revision": "b50544dafada6c41e80bfde2f57cc9b0140fc21c"
         },
-        {
-          "model_format": "awq",
-          "model_size_in_billions": 4,
-          "quantizations": [
-            "Int4"
-          ],
-          "model_id": "OpenGVLab/InternVL2-8B-AWQ",
-          "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
-        },
         {
           "model_format": "pytorch",
           "model_size_in_billions": 8,
@@ -7209,6 +7200,15 @@
           "model_id": "OpenGVLab/InternVL2-8B",
           "model_revision": "3bfd3664dea4f3da628785f5125d30f889701253"
         },
+        {
+          "model_format": "awq",
+          "model_size_in_billions": 8,
+          "quantizations": [
+            "Int4"
+          ],
+          "model_id": "OpenGVLab/InternVL2-8B-AWQ",
+          "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
+        },
         {
           "model_format": "pytorch",
           "model_size_in_billions": 26,

diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py
@@ -271,6 +271,8 @@ def parse_raw(
 
 MLX_CLASSES: List[Type[LLM]] = []
 
+LMDEPLOY_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
 

diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
@@ -4778,10 +4778,10 @@
             "model_revision": "master"
         },
         {
-            "model_format": "pytorch",
+            "model_format": "awq",
             "model_size_in_billions": 2,
             "quantizations": [
-              "none"
+              "Int4"
             ],
             "model_hub": "modelscope",
             "model_id": "OpenGVLab/InternVL2-2B-AWQ",
@@ -4812,10 +4812,10 @@
             "model_revision": "master"
         },
         {
-            "model_format": "pytorch",
+            "model_format": "awq",
             "model_size_in_billions": 8,
             "quantizations": [
-              "none"
+              "Int4"
             ],
             "model_hub": "modelscope",
             "model_id": "OpenGVLab/InternVL2-8B-AWQ",
@@ -4834,10 +4834,10 @@
             "model_revision": "master"
         },
         {
-            "model_format": "pytorch",
+            "model_format": "awq",
             "model_size_in_billions": 26,
             "quantizations": [
-              "none"
+              "Int4"
             ],
             "model_hub": "modelscope",
             "model_id": "OpenGVLab/InternVL2-26B-AWQ",
@@ -4856,10 +4856,10 @@
             "model_revision": "master"
         },
         {
-            "model_format": "pytorch",
+            "model_format": "awq",
             "model_size_in_billions": 40,
             "quantizations": [
-              "none"
+              "Int4"
             ],
             "model_hub": "modelscope",
             "model_id": "OpenGVLab/InternVL2-40B-AWQ",
@@ -4878,10 +4878,10 @@
             "model_revision": "master"
         },
         {
-            "model_format": "pytorch",
+            "model_format": "awq",
             "model_size_in_billions": 76,
             "quantizations": [
-              "none"
+              "Int4"
             ],
             "model_hub": "modelscope",
             "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",

diff --git a/xinference/model/llm/lmdeploy/__init__.py b/xinference/model/llm/lmdeploy/__init__.py