Commit 6349d91

Update to OpenVINO 2023.3, stateful model support
helena-intel committed Feb 2, 2024
1 parent 16fc318 commit 6349d91
Showing 5 changed files with 48 additions and 12 deletions.
Dockerfile: 1 addition & 1 deletion

@@ -310,7 +310,7 @@ RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/
 # Install server
 COPY proto proto
 COPY server server
-# RUN --mount=type=cache,target=/root/.cache/pip cd server && make gen-server && pip install ".[accelerate, openvino]"
+# RUN --mount=type=cache,target=/root/.cache/pip cd server && make gen-server && pip install ".[accelerate, onnx-gpu, openvino, quantize]"
 RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu, openvino, quantize]" --no-cache-dir

 # Patch codegen model changes into transformers 4.35
server/poetry.lock: 31 additions & 1 deletion

Some generated files are not rendered by default.
server/pyproject.toml: 2 additions & 0 deletions

@@ -23,6 +23,7 @@ datasets = { version = "^2.15.0", optional = true }
 texttable = { version = "^1.7.0", optional = true }
 transformers = "4.37.1"
 optimum = { version = "^1.16.2", extras = ["onnxruntime-gpu"], optional = true }
+optimum-intel = { version = ">=1.14.0", extras = ["openvino,nncf"], optional = true }
 onnxruntime = { version = "^1.16.3", optional = true }
 onnxruntime-gpu = { version = "^1.16.3", optional = true }
 onnx = { version = "^1.15.0", optional = true }

@@ -41,6 +42,7 @@
 accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
 onnx = ["optimum", "onnxruntime", "onnx"]
 onnx-gpu = ["optimum", "onnxruntime-gpu", "onnx"]
+openvino = ["optimum-intel"]
 # These are only required if using the quantize cli command
 quantize = ["datasets", "texttable"]
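The new openvino extra is what the Dockerfile's pip install ".[accelerate, onnx-gpu, openvino, quantize]" line resolves: it pulls in optimum-intel >=1.14.0 with its openvino and nncf extras, which provide the OVModelForCausalLM class used to run models on OpenVINO. A minimal sketch of loading a model through that API, assuming a generic causal LM checkpoint ("gpt2" is a placeholder, not taken from this commit):

from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "gpt2"  # placeholder checkpoint for illustration

# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly.
# Recent optimum-intel releases export causal LMs as stateful models by
# default, which is the support this commit targets.
model = OVModelForCausalLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Hello, OpenVINO!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])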
(file name not rendered): 2 additions & 1 deletion

@@ -51,6 +51,7 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str],  # not used by OpenVINO
         model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)
         print(f"Optimum Intel version: {__version__}")

@@ -68,7 +69,7 @@ def __init__(
         if ov_config_file is not None:
             ov_config = json.loads(Path(ov_config_file).read_text())
         else:
-            ov_config = {"CACHE_DIR": ""}
+            ov_config = {}

         # Set good default options for latency-optimized workflow
         if "PERFORMANCE_HINT" not in ov_config:
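The second hunk replaces the old {"CACHE_DIR": ""} default, which explicitly disabled OpenVINO's model cache, with an empty dict; with OpenVINO 2023.3 that workaround is apparently no longer needed. A standalone sketch of the surrounding logic, under the assumption that the config path comes from an environment variable (OV_CONFIG is a hypothetical name; the hunk only shows the ov_config_file variable) and that the line cut off below the hunk sets OpenVINO's LATENCY hint:

import json
import os
from pathlib import Path

# Hypothetical env var; the diff only shows the ov_config_file variable.
ov_config_file = os.environ.get("OV_CONFIG")

if ov_config_file is not None:
    # A user-supplied JSON config file takes precedence.
    ov_config = json.loads(Path(ov_config_file).read_text())
else:
    ov_config = {}  # was {"CACHE_DIR": ""} before this commit

# Set good default options for a latency-optimized workflow. "LATENCY" is
# OpenVINO's standard single-stream performance hint (an assumption here;
# the line that sets the value is cut off in the hunk).
if "PERFORMANCE_HINT" not in ov_config:
    ov_config["PERFORMANCE_HINT"] = "LATENCY"

print(ov_config)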
server/text_generation_server/models/causal_lm.py: 12 additions & 9 deletions

@@ -571,16 +571,19 @@ def __init__(
         else:
             self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})

-        # Perform a forward pass to determine the structure of the past_key_values
-        one_token = torch.tensor([[1]], device=inference_engine.get_device())
-        _, past_key_values, _ = self.forward(input_ids=one_token, attention_mask=one_token)
-        if torch.is_tensor(past_key_values[0]):
-            self.batch_type = CombinedKVCausalLMBatch
+        if deployment_framework == "hf_optimum_ov" and self.model.stateful:
+            self.batch_type = CausalLMBatch
         else:
-            # check the ordering of the key tensor dimensions
-            key_past, value_past = past_key_values[0]
-            keys_head_dim_last = key_past.shape[-1] == value_past.shape[-1]
-            self.batch_type = CausalLMBatch if keys_head_dim_last else KeysDimTransposedCausalLMBatch
+            # Perform a forward pass to determine the structure of the past_key_values
+            one_token = torch.tensor([[1]], device=inference_engine.get_device())
+            _, past_key_values, _ = self.forward(input_ids=one_token, attention_mask=one_token)
+            if torch.is_tensor(past_key_values[0]):
+                self.batch_type = CombinedKVCausalLMBatch
+            else:
+                # check the ordering of the key tensor dimensions
+                key_past, value_past = past_key_values[0]
+                keys_head_dim_last = key_past.shape[-1] == value_past.shape[-1]
+                self.batch_type = CausalLMBatch if keys_head_dim_last else KeysDimTransposedCausalLMBatch

     @property
     def batch_type(self) -> Type[CausalLMBatch]:
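The rewrite short-circuits the past_key_values probe for stateful OpenVINO models: a stateful model keeps its KV cache inside the compiled model, so there is no externally visible cache structure to inspect, and CausalLMBatch is chosen directly. Everything else falls through to the original one-token forward pass. The selection logic, pulled out into a standalone sketch (the batch classes are stand-ins for the real ones in this module, and select_batch_type is a hypothetical helper, not a function in this file):

import torch

# Stand-ins for the real batch classes defined in causal_lm.py
class CausalLMBatch: ...
class CombinedKVCausalLMBatch: ...
class KeysDimTransposedCausalLMBatch: ...

def select_batch_type(model, deployment_framework, forward_fn, device):
    """Sketch of the batch-type selection performed in CausalLM.__init__."""
    if deployment_framework == "hf_optimum_ov" and getattr(model, "stateful", False):
        # Stateful OV models manage the KV cache internally; nothing to probe.
        return CausalLMBatch
    # Perform a forward pass to determine the structure of the past_key_values
    one_token = torch.tensor([[1]], device=device)
    _, past_key_values, _ = forward_fn(input_ids=one_token, attention_mask=one_token)
    if torch.is_tensor(past_key_values[0]):
        # Keys and values fused into a single tensor per layer
        return CombinedKVCausalLMBatch
    # Otherwise check the ordering of the key tensor dimensions
    key_past, value_past = past_key_values[0]
    if key_past.shape[-1] == value_past.shape[-1]:
        return CausalLMBatch
    return KeysDimTransposedCausalLMBatch

For a stateful model served through hf_optimum_ov the probe is skipped entirely; for all other models the behavior matches the pre-commit code.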
