From 5b3740091fca39458cd9fe5ae18effd9f3700188 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 5 Jan 2024 12:35:52 +0100 Subject: [PATCH 1/3] Add Phi-2. --- spacy_llm/models/hf/__init__.py | 2 + spacy_llm/models/hf/phi2.py | 115 ++++++++++++++++++++++++++++ spacy_llm/tests/models/test_phi2.py | 85 ++++++++++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 spacy_llm/models/hf/phi2.py create mode 100644 spacy_llm/tests/models/test_phi2.py diff --git a/spacy_llm/models/hf/__init__.py b/spacy_llm/models/hf/__init__.py index b3afbb71..f7414101 100644 --- a/spacy_llm/models/hf/__init__.py +++ b/spacy_llm/models/hf/__init__.py @@ -4,6 +4,7 @@ from .llama2 import llama2_hf from .mistral import mistral_hf from .openllama import openllama_hf +from .phi2 import phi2_hf from .stablelm import stablelm_hf __all__ = [ @@ -13,5 +14,6 @@ "llama2_hf", "mistral_hf", "openllama_hf", + "phi2_hf", "stablelm_hf", ] diff --git a/spacy_llm/models/hf/phi2.py b/spacy_llm/models/hf/phi2.py new file mode 100644 index 00000000..be195347 --- /dev/null +++ b/spacy_llm/models/hf/phi2.py @@ -0,0 +1,115 @@ +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple + +from confection import SimpleFrozenDict + +from ...compat import Literal, transformers +from ...registry.util import registry +from .base import HuggingFace + + +class Phi2(HuggingFace): + MODEL_NAMES = Literal["phi-2"] # noqa: F722 + + def __init__( + self, + name: str, + config_init: Optional[Dict[str, Any]], + config_run: Optional[Dict[str, Any]], + context_length: Optional[int], + ): + self._tokenizer: Optional["transformers.AutoTokenizer"] = None + super().__init__( + name=name, + config_init=config_init, + config_run=config_run, + context_length=context_length, + ) + + def init_model(self) -> "transformers.AutoModelForCausalLM": + """Sets up HF model and needed utilities. + RETURNS (Any): HF model. + """ + # Initialize tokenizer and model. + self._tokenizer = transformers.AutoTokenizer.from_pretrained( + self._name, trust_remote_code=True + ) + init_cfg = self._config_init + device: Optional[str] = None + if "device" in init_cfg: + device = init_cfg.pop("device") + + model = transformers.AutoModelForCausalLM.from_pretrained( + self._name, **init_cfg + ) + if device: + model.to(device) + + return model + + def __call__(self, prompts: Iterable[Iterable[str]]) -> Iterable[Iterable[str]]: # type: ignore[override] + assert callable(self._tokenizer) + responses: List[List[str]] = [] + + for prompts_for_doc in prompts: + tokenized_input_ids = [ + self._tokenizer( + prompt, return_tensors="pt", return_attention_mask=False + ).input_ids + for prompt in prompts_for_doc + ] + tokenized_input_ids = [ + tii.to(self._model.device) for tii in tokenized_input_ids + ] + + assert hasattr(self._model, "generate") + responses.append( + [ + self._tokenizer.decode( + self._model.generate(input_ids=tii, **self._config_run)[ + :, tii.shape[1] : + ][0], + ) + for tii in tokenized_input_ids + ] + ) + + return responses + + @property + def hf_account(self) -> str: + return "microsoft" + + @staticmethod + def compile_default_configs() -> Tuple[Dict[str, Any], Dict[str, Any]]: + # See https://huggingface.co/microsoft/phi-2#sample-code for recommended setting combinations. 
+        default_cfg_init, default_cfg_run = HuggingFace.compile_default_configs()
+        return (
+            {
+                **default_cfg_init,
+                "torch_dtype": "auto",
+                "device_map": "cuda",
+                "trust_remote_code": True,
+            },
+            {
+                **default_cfg_run,
+                "max_new_tokens": 200,
+            },
+        )
+
+
+@registry.llm_models("spacy.Phi-2.v1")
+def phi2_hf(
+    name: Phi2.MODEL_NAMES,
+    config_init: Optional[Dict[str, Any]] = SimpleFrozenDict(),
+    config_run: Optional[Dict[str, Any]] = SimpleFrozenDict(),
+) -> Callable[[Iterable[Iterable[str]]], Iterable[Iterable[str]]]:
+    """Generates Phi-2 instance that can execute a set of prompts and return the raw responses.
+    name (Literal): Name of the Phi-2 model. Has to be one of Phi2.get_model_names().
+    config_init (Optional[Dict[str, Any]]): HF config for initializing the model.
+    config_run (Optional[Dict[str, Any]]): HF config for running the model.
+    RETURNS (Callable[[Iterable[str]], Iterable[str]]): Phi-2 instance that can execute a set of prompts and return
+        the raw responses.
+    """
+    return Phi2(
+        name=name, config_init=config_init, config_run=config_run, context_length=2048
+    )
diff --git a/spacy_llm/tests/models/test_phi2.py b/spacy_llm/tests/models/test_phi2.py
new file mode 100644
index 00000000..771b0462
--- /dev/null
+++ b/spacy_llm/tests/models/test_phi2.py
@@ -0,0 +1,85 @@
+import copy
+
+import pytest
+import spacy
+from confection import Config  # type: ignore[import]
+from thinc.compat import has_torch_cuda_gpu
+
+from ...compat import torch
+
+_PIPE_CFG = {
+    "model": {
+        "@llm_models": "spacy.Phi-2.v1",
+        "name": "phi-2",
+    },
+    "task": {"@llm_tasks": "spacy.NoOp.v1"},
+    "save_io": True,
+}
+
+_NLP_CONFIG = """
+[nlp]
+lang = "en"
+pipeline = ["llm"]
+batch_size = 128
+
+[components]
+
+[components.llm]
+factory = "llm"
+save_io = True
+
+[components.llm.task]
+@llm_tasks = "spacy.NoOp.v1"
+
+[components.llm.model]
+@llm_models = "spacy.Phi-2.v1"
+name = "phi-2"
+"""
+
+
+@pytest.mark.gpu
+@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
+def test_init():
+    """Test initialization and simple run."""
+    nlp = spacy.blank("en")
+    nlp.add_pipe("llm", config=_PIPE_CFG)
+    doc = nlp("This is a test.")
+    torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"][0].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"][0]
+    )
+
+
+@pytest.mark.gpu
+@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
+def test_init_with_set_config():
+    """Test initialization and simple run with changed config."""
+    nlp = spacy.blank("en")
+    cfg = copy.deepcopy(_PIPE_CFG)
+    cfg["model"]["config_run"] = {"max_new_tokens": 32}
+    nlp.add_pipe("llm", config=cfg)
+    doc = nlp("This is a test.")
+    torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"][0].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"][0]
+    )
+
+
+@pytest.mark.gpu
+@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
+def test_init_from_config():
+    orig_config = Config().from_str(_NLP_CONFIG)
+    nlp = spacy.util.load_model_from_config(orig_config, auto_fill=True)
+    assert nlp.pipe_names == ["llm"]
+    torch.cuda.empty_cache()
+
+
+@pytest.mark.gpu
+@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
+def test_invalid_model():
+    orig_config = Config().from_str(_NLP_CONFIG)
+    config = copy.deepcopy(orig_config)
+    config["components"]["llm"]["model"]["name"] = "anything-else"
+    with pytest.raises(ValueError, match="unexpected value; permitted"):
+        spacy.util.load_model_from_config(config, auto_fill=True)
+    torch.cuda.empty_cache()

From 592f3966c2a3474925e5f1f52019e86a9c9f8fda Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Sat, 6 Jan 2024 11:20:28 +0100
Subject: [PATCH 2/3] Add Mixtral.

---
 spacy_llm/models/hf/__init__.py        |   2 +
 spacy_llm/models/hf/base.py            |   8 ++
 spacy_llm/models/hf/mistral.py         |   7 +-
 spacy_llm/models/hf/mixtral.py         | 108 +++++++++++++++++++++++++
 spacy_llm/tests/models/test_mixtral.py |  70 ++++++++++++++++
 5 files changed, 191 insertions(+), 4 deletions(-)
 create mode 100644 spacy_llm/models/hf/mixtral.py
 create mode 100644 spacy_llm/tests/models/test_mixtral.py

diff --git a/spacy_llm/models/hf/__init__.py b/spacy_llm/models/hf/__init__.py
index f7414101..bc268528 100644
--- a/spacy_llm/models/hf/__init__.py
+++ b/spacy_llm/models/hf/__init__.py
@@ -3,6 +3,7 @@
 from .falcon import falcon_hf
 from .llama2 import llama2_hf
 from .mistral import mistral_hf
+from .mixtral import mixtral_hf
 from .openllama import openllama_hf
 from .phi2 import phi2_hf
 from .stablelm import stablelm_hf
@@ -13,6 +14,7 @@
     "falcon_hf",
     "llama2_hf",
     "mistral_hf",
+    "mixtral_hf",
     "openllama_hf",
     "phi2_hf",
     "stablelm_hf",
diff --git a/spacy_llm/models/hf/base.py b/spacy_llm/models/hf/base.py
index b8f8b7b7..7232321c 100644
--- a/spacy_llm/models/hf/base.py
+++ b/spacy_llm/models/hf/base.py
@@ -69,6 +69,14 @@ def __init__(
                 f"Double-check you specified a valid dtype."
             ) from ex

+        # Recognize boolean attributes.
+        for key, value in self._config_init.items():
+            if value in ("True", "False"):
+                self._config_init[key] = False if value == "False" else True
+        for key, value in self._config_run.items():
+            if value in ("True", "False"):
+                self._config_run[key] = False if value == "False" else True
+
         # Init HF model.
         HuggingFace.check_installation()
         self._check_model()
diff --git a/spacy_llm/models/hf/mistral.py b/spacy_llm/models/hf/mistral.py
index 3c5039a2..53883110 100644
--- a/spacy_llm/models/hf/mistral.py
+++ b/spacy_llm/models/hf/mistral.py
@@ -65,7 +65,7 @@ def __call__(self, prompts: Iterable[Iterable[str]]) -> Iterable[Iterable[str]]:

             tokenized_input_ids = [
                 self._tokenizer(
-                    prompt if not self._is_instruct else f"[INST] {prompt} [/INST]",
+                    prompt if not self._is_instruct else f"[INST] {prompt} [/INST]",
                     return_tensors="pt",
                 ).input_ids
                 for prompt in prompts_for_doc
@@ -96,11 +96,10 @@ def mistral_hf(
     config_run: Optional[Dict[str, Any]] = SimpleFrozenDict(),
 ) -> Callable[[Iterable[Iterable[str]]], Iterable[Iterable[str]]]:
     """Generates Mistral instance that can execute a set of prompts and return the raw responses.
-    name (Literal): Name of the Falcon model. Has to be one of Falcon.get_model_names().
+    name (Literal): Name of the Mistral model. Has to be one of Mistral.get_model_names().
     config_init (Optional[Dict[str, Any]]): HF config for initializing the model.
     config_run (Optional[Dict[str, Any]]): HF config for running the model.
-    RETURNS (Callable[[Iterable[str]], Iterable[str]]): Falcon instance that can execute a set of prompts and return
-        the raw responses.
+    RETURNS (Mistral): Mistral instance that can execute a set of prompts and return the raw responses.
""" return Mistral( name=name, config_init=config_init, config_run=config_run, context_length=8000 diff --git a/spacy_llm/models/hf/mixtral.py b/spacy_llm/models/hf/mixtral.py new file mode 100644 index 00000000..773c368d --- /dev/null +++ b/spacy_llm/models/hf/mixtral.py @@ -0,0 +1,108 @@ +from typing import Any, Callable, Dict, Iterable, List, Optional + +from confection import SimpleFrozenDict + +from ...compat import Literal, transformers +from ...registry.util import registry +from .base import HuggingFace + + +class Mixtral(HuggingFace): + MODEL_NAMES = Literal[ + "Mixtral-8x7B-v0.1", "Mixtral-8x7B-Instruct-v0.1" + ] # noqa: F722 + + def __init__( + self, + name: MODEL_NAMES, + config_init: Optional[Dict[str, Any]], + config_run: Optional[Dict[str, Any]], + context_length: Optional[int], + ): + self._tokenizer: Optional["transformers.AutoTokenizer"] = None + self._is_instruct = "instruct" in name + super().__init__( + name=name, + config_init=config_init, + config_run=config_run, + context_length=context_length, + ) + + assert isinstance(self._tokenizer, transformers.PreTrainedTokenizerBase) + + # Instantiate GenerationConfig object from config dict. + self._hf_config_run = transformers.GenerationConfig.from_pretrained( + self._name, **self._config_run + ) + # To avoid deprecation warning regarding usage of `max_length`. + self._hf_config_run.max_new_tokens = self._hf_config_run.max_length + + def init_model(self) -> Any: + self._tokenizer = transformers.AutoTokenizer.from_pretrained(self._name) + init_cfg = self._config_init + device: Optional[str] = None + if "device" in init_cfg: + device = init_cfg.pop("device") + + model = transformers.AutoModelForCausalLM.from_pretrained( + self._name, **init_cfg, resume_download=True + ) + if device: + model.to(device) + + return model + + @property + def hf_account(self) -> str: + return "mistralai" + + def __call__(self, prompts: Iterable[Iterable[str]]) -> Iterable[Iterable[str]]: # type: ignore[override] + assert callable(self._tokenizer) + assert hasattr(self._model, "generate") + assert hasattr(self._tokenizer, "batch_decode") + responses: List[List[str]] = [] + + for prompts_for_doc in prompts: + prompts_for_doc = list(prompts_for_doc) + + tokenized_input_ids = [ + self._tokenizer( + prompt if not self._is_instruct else f"[INST] {prompt} [/INST]", + return_tensors="pt", + ).input_ids + for prompt in prompts_for_doc + ] + tokenized_input_ids = [ + tp.to(self._model.device) for tp in tokenized_input_ids + ] + + responses.append( + [ + self._tokenizer.decode( + self._model.generate( + input_ids=tok_ii, generation_config=self._hf_config_run + )[:, tok_ii.shape[1] :][0], + skip_special_tokens=True, + ) + for tok_ii in tokenized_input_ids + ] + ) + + return responses + + +@registry.llm_models("spacy.Mixtral.v1") +def mixtral_hf( + name: Mixtral.MODEL_NAMES, + config_init: Optional[Dict[str, Any]] = SimpleFrozenDict(), + config_run: Optional[Dict[str, Any]] = SimpleFrozenDict(), +) -> Callable[[Iterable[Iterable[str]]], Iterable[Iterable[str]]]: + """Generates Mixtral instance that can execute a set of prompts and return the raw responses. + name (Literal): Name of the Mixtral model. Has to be one of Mixtral.get_model_names(). + config_init (Optional[Dict[str, Any]]): HF config for initializing the model. + config_run (Optional[Dict[str, Any]]): HF config for running the model. + RETURNS (Mixtral): Mixtral instance that can execute a set of prompts and return the raw responses. 
+ """ + return Mixtral( + name=name, config_init=config_init, config_run=config_run, context_length=8000 + ) diff --git a/spacy_llm/tests/models/test_mixtral.py b/spacy_llm/tests/models/test_mixtral.py new file mode 100644 index 00000000..0dafad96 --- /dev/null +++ b/spacy_llm/tests/models/test_mixtral.py @@ -0,0 +1,70 @@ +import copy + +import pytest +import spacy +from confection import Config # type: ignore[import] +from thinc.compat import has_torch_cuda_gpu + +from ...compat import torch + +_PIPE_CFG = { + "model": { + "@llm_models": "spacy.Mixtral.v1", + "name": "Mixtral-8x7B-Instruct-v0.1", + }, + "task": {"@llm_tasks": "spacy.NoOp.v1"}, +} + +_NLP_CONFIG = """ + +[nlp] +lang = "en" +pipeline = ["llm"] +batch_size = 128 + +[components] + +[components.llm] +factory = "llm" + +[components.llm.task] +@llm_tasks = "spacy.NoOp.v1" + +[components.llm.model] +@llm_models = "spacy.Mixtral.v1" +name = "Mixtral-8x7B-Instruct-v0.1.1" +""" + + +@pytest.mark.gpu +@pytest.mark.skip(reason="CI runner needs more GPU memory") +@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA") +def test_init(): + """Test initialization and simple run.""" + nlp = spacy.blank("en") + cfg = copy.deepcopy(_PIPE_CFG) + nlp.add_pipe("llm", config=cfg) + nlp("This is a test.") + torch.cuda.empty_cache() + + +@pytest.mark.gpu +@pytest.mark.skip(reason="CI runner needs more GPU memory") +@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA") +def test_init_from_config(): + orig_config = Config().from_str(_NLP_CONFIG) + nlp = spacy.util.load_model_from_config(orig_config, auto_fill=True) + assert nlp.pipe_names == ["llm"] + torch.cuda.empty_cache() + + +@pytest.mark.gpu +@pytest.mark.skip(reason="CI runner needs more GPU memory") +@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA") +def test_invalid_model(): + orig_config = Config().from_str(_NLP_CONFIG) + config = copy.deepcopy(orig_config) + config["components"]["llm"]["model"]["name"] = "x" + with pytest.raises(ValueError, match="unexpected value; permitted"): + spacy.util.load_model_from_config(config, auto_fill=True) + torch.cuda.empty_cache() From e4d7aa508556cf9b30971bfad388cc6c73fcdbd1 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 7 Feb 2024 09:52:06 +0100 Subject: [PATCH 3/3] Remove Mixtral. 
--- spacy_llm/models/hf/__init__.py | 2 - spacy_llm/models/hf/mixtral.py | 108 ------------------------- spacy_llm/tests/models/test_mixtral.py | 70 ---------------- 3 files changed, 180 deletions(-) delete mode 100644 spacy_llm/models/hf/mixtral.py delete mode 100644 spacy_llm/tests/models/test_mixtral.py diff --git a/spacy_llm/models/hf/__init__.py b/spacy_llm/models/hf/__init__.py index bc268528..f7414101 100644 --- a/spacy_llm/models/hf/__init__.py +++ b/spacy_llm/models/hf/__init__.py @@ -3,7 +3,6 @@ from .falcon import falcon_hf from .llama2 import llama2_hf from .mistral import mistral_hf -from .mixtral import mixtral_hf from .openllama import openllama_hf from .phi2 import phi2_hf from .stablelm import stablelm_hf @@ -14,7 +13,6 @@ "falcon_hf", "llama2_hf", "mistral_hf", - "mixtral_hf", "openllama_hf", "phi2_hf", "stablelm_hf", diff --git a/spacy_llm/models/hf/mixtral.py b/spacy_llm/models/hf/mixtral.py deleted file mode 100644 index 773c368d..00000000 --- a/spacy_llm/models/hf/mixtral.py +++ /dev/null @@ -1,108 +0,0 @@ -from typing import Any, Callable, Dict, Iterable, List, Optional - -from confection import SimpleFrozenDict - -from ...compat import Literal, transformers -from ...registry.util import registry -from .base import HuggingFace - - -class Mixtral(HuggingFace): - MODEL_NAMES = Literal[ - "Mixtral-8x7B-v0.1", "Mixtral-8x7B-Instruct-v0.1" - ] # noqa: F722 - - def __init__( - self, - name: MODEL_NAMES, - config_init: Optional[Dict[str, Any]], - config_run: Optional[Dict[str, Any]], - context_length: Optional[int], - ): - self._tokenizer: Optional["transformers.AutoTokenizer"] = None - self._is_instruct = "instruct" in name - super().__init__( - name=name, - config_init=config_init, - config_run=config_run, - context_length=context_length, - ) - - assert isinstance(self._tokenizer, transformers.PreTrainedTokenizerBase) - - # Instantiate GenerationConfig object from config dict. - self._hf_config_run = transformers.GenerationConfig.from_pretrained( - self._name, **self._config_run - ) - # To avoid deprecation warning regarding usage of `max_length`. 
- self._hf_config_run.max_new_tokens = self._hf_config_run.max_length - - def init_model(self) -> Any: - self._tokenizer = transformers.AutoTokenizer.from_pretrained(self._name) - init_cfg = self._config_init - device: Optional[str] = None - if "device" in init_cfg: - device = init_cfg.pop("device") - - model = transformers.AutoModelForCausalLM.from_pretrained( - self._name, **init_cfg, resume_download=True - ) - if device: - model.to(device) - - return model - - @property - def hf_account(self) -> str: - return "mistralai" - - def __call__(self, prompts: Iterable[Iterable[str]]) -> Iterable[Iterable[str]]: # type: ignore[override] - assert callable(self._tokenizer) - assert hasattr(self._model, "generate") - assert hasattr(self._tokenizer, "batch_decode") - responses: List[List[str]] = [] - - for prompts_for_doc in prompts: - prompts_for_doc = list(prompts_for_doc) - - tokenized_input_ids = [ - self._tokenizer( - prompt if not self._is_instruct else f"[INST] {prompt} [/INST]", - return_tensors="pt", - ).input_ids - for prompt in prompts_for_doc - ] - tokenized_input_ids = [ - tp.to(self._model.device) for tp in tokenized_input_ids - ] - - responses.append( - [ - self._tokenizer.decode( - self._model.generate( - input_ids=tok_ii, generation_config=self._hf_config_run - )[:, tok_ii.shape[1] :][0], - skip_special_tokens=True, - ) - for tok_ii in tokenized_input_ids - ] - ) - - return responses - - -@registry.llm_models("spacy.Mixtral.v1") -def mixtral_hf( - name: Mixtral.MODEL_NAMES, - config_init: Optional[Dict[str, Any]] = SimpleFrozenDict(), - config_run: Optional[Dict[str, Any]] = SimpleFrozenDict(), -) -> Callable[[Iterable[Iterable[str]]], Iterable[Iterable[str]]]: - """Generates Mixtral instance that can execute a set of prompts and return the raw responses. - name (Literal): Name of the Mixtral model. Has to be one of Mixtral.get_model_names(). - config_init (Optional[Dict[str, Any]]): HF config for initializing the model. - config_run (Optional[Dict[str, Any]]): HF config for running the model. - RETURNS (Mixtral): Mixtral instance that can execute a set of prompts and return the raw responses. 
- """ - return Mixtral( - name=name, config_init=config_init, config_run=config_run, context_length=8000 - ) diff --git a/spacy_llm/tests/models/test_mixtral.py b/spacy_llm/tests/models/test_mixtral.py deleted file mode 100644 index 0dafad96..00000000 --- a/spacy_llm/tests/models/test_mixtral.py +++ /dev/null @@ -1,70 +0,0 @@ -import copy - -import pytest -import spacy -from confection import Config # type: ignore[import] -from thinc.compat import has_torch_cuda_gpu - -from ...compat import torch - -_PIPE_CFG = { - "model": { - "@llm_models": "spacy.Mixtral.v1", - "name": "Mixtral-8x7B-Instruct-v0.1", - }, - "task": {"@llm_tasks": "spacy.NoOp.v1"}, -} - -_NLP_CONFIG = """ - -[nlp] -lang = "en" -pipeline = ["llm"] -batch_size = 128 - -[components] - -[components.llm] -factory = "llm" - -[components.llm.task] -@llm_tasks = "spacy.NoOp.v1" - -[components.llm.model] -@llm_models = "spacy.Mixtral.v1" -name = "Mixtral-8x7B-Instruct-v0.1.1" -""" - - -@pytest.mark.gpu -@pytest.mark.skip(reason="CI runner needs more GPU memory") -@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA") -def test_init(): - """Test initialization and simple run.""" - nlp = spacy.blank("en") - cfg = copy.deepcopy(_PIPE_CFG) - nlp.add_pipe("llm", config=cfg) - nlp("This is a test.") - torch.cuda.empty_cache() - - -@pytest.mark.gpu -@pytest.mark.skip(reason="CI runner needs more GPU memory") -@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA") -def test_init_from_config(): - orig_config = Config().from_str(_NLP_CONFIG) - nlp = spacy.util.load_model_from_config(orig_config, auto_fill=True) - assert nlp.pipe_names == ["llm"] - torch.cuda.empty_cache() - - -@pytest.mark.gpu -@pytest.mark.skip(reason="CI runner needs more GPU memory") -@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA") -def test_invalid_model(): - orig_config = Config().from_str(_NLP_CONFIG) - config = copy.deepcopy(orig_config) - config["components"]["llm"]["model"]["name"] = "x" - with pytest.raises(ValueError, match="unexpected value; permitted"): - spacy.util.load_model_from_config(config, auto_fill=True) - torch.cuda.empty_cache()
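
Usage note (a minimal sketch, not part of the patches above): the spacy.Phi-2.v1 model registered in PATCH 1/3 can be wired into a spacy-llm pipeline as shown below. The snippet is assembled from the test configuration in test_phi2.py; the NoOp task and the max_new_tokens override are illustrative choices, and a CUDA-capable GPU is assumed because the default config_init sets device_map to "cuda".

    import spacy

    # Build a blank English pipeline and add the llm component backed by Phi-2.
    nlp = spacy.blank("en")
    nlp.add_pipe(
        "llm",
        config={
            "model": {
                "@llm_models": "spacy.Phi-2.v1",
                "name": "phi-2",
                # config_run is forwarded to the HF generate() call; this
                # overrides the default max_new_tokens of 200.
                "config_run": {"max_new_tokens": 64},
            },
            "task": {"@llm_tasks": "spacy.NoOp.v1"},
            "save_io": True,
        },
    )

    doc = nlp("This is a test.")
    # With save_io=True, the raw prompt and response are stored on the doc.
    print(doc.user_data["llm_io"]["llm"]["response"][0])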