Skip to content

Commit

Permalink
Use opset15 version of Str Pack/Unpack
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Dec 12, 2024
1 parent 1da0d2c commit 49223d4
Show file tree
Hide file tree
Showing 8 changed files with 21 additions and 136 deletions.
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/build_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def build_rwkv_tokenizer(
if clean_up_tokenization_spaces:
RegexDecodingStep.clean_up_tokenization_spaces().get_ov_subgraph(detokenizer_output)

detokenizer_output = _get_factory().create("StringTensorPack", detokenizer_output).outputs()
detokenizer_output = _get_factory("opset15").create("StringTensorPack", detokenizer_output).outputs()
detokenizer_output[0].tensor.add_names({STRING_OUTPUT_NAME})

detokenizer = Model(detokenizer_output, [detokenizer_input], DETOKENIZER_NAME)
Expand Down
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/hf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -985,7 +985,7 @@ def get_sp_detokenizer(
if params.utf8_replace_mode is not None and params.utf8_replace_mode != UTF8ReplaceMode.DISABLE:
last_sinks = UTF8ValidateStep(params.utf8_replace_mode).get_ov_subgraph(detokenizer)

string_output = _get_factory().create("StringTensorPack", last_sinks).outputs()
string_output = _get_factory("opset15").create("StringTensorPack", last_sinks).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_detokenizer = Model(string_output, [model_input], DETOKENIZER_NAME)
tokenizer_detokenizer.validate_nodes_and_infer_types()
Expand Down
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1274,7 +1274,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
pipeline_step = step.get_ov_subgraph(input_nodes)
input_nodes = pipeline_step

return _get_factory().create("StringTensorPack", input_nodes).outputs()
return _get_factory("opset15").create("StringTensorPack", input_nodes).outputs()

def get_detokenizer_ov_subgraph(self) -> Model:
self.finalize()
Expand Down
33 changes: 0 additions & 33 deletions src/string_tensor_pack.cpp

This file was deleted.

16 changes: 7 additions & 9 deletions src/string_tensor_pack.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,24 @@
#pragma once

#include <openvino/op/op.hpp>
#include <openvino/op/string_tensor_pack.hpp>

// Having a decomposed representation for a tensor, converts it to a single string tensor with element::string element type.
class StringTensorPack : public ov::op::Op {
class StringTensorPack : public ov::op::v15::StringTensorPack {
public:
OPENVINO_OP("StringTensorPack");

StringTensorPack () = default;

StringTensorPack(ov::OutputVector inputs, const std::string& mode = "begins_ends")
: ov::op::Op(inputs), m_mode(mode) {
: ov::op::v15::StringTensorPack(inputs[0], inputs[1], inputs[2]), m_mode(mode) {
constructor_validate_and_infer_types();
}

void validate_and_infer_types() override;
void validate_and_infer_types() override {
        OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorPack supports only 'begins_ends' mode, but got ", m_mode);
ov::op::v15::StringTensorPack::validate_and_infer_types();
}

std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
auto result = std::make_shared<StringTensorPack>(inputs, m_mode);
Expand All @@ -30,12 +34,6 @@ class StringTensorPack : public ov::op::Op {
return true;
}

bool has_evaluate() const override {
return true;
}

bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;

private:

std::string m_mode = "begins_ends";
Expand Down
78 changes: 0 additions & 78 deletions src/string_tensor_unpack.cpp

This file was deleted.

16 changes: 7 additions & 9 deletions src/string_tensor_unpack.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,27 @@
#pragma once

#include <openvino/op/op.hpp>
#include <openvino/op/string_tensor_unpack.hpp>

// Unpack a string tensor representation regardless of the source format, which
// can be an OpenVINO tensor with element::string element type or u8 legacy packed
// representation, to a decompose tensor representation that may potentially
// consist of multiple tensors. The destination format is defined by `mode` attribute.
class StringTensorUnpack : public ov::op::Op {
class StringTensorUnpack : public ov::op::v15::StringTensorUnpack {
public:
OPENVINO_OP("StringTensorUnpack");

StringTensorUnpack () = default;

StringTensorUnpack(ov::OutputVector inputs, const std::string& mode = "begins_ends")
: ov::op::Op(inputs), m_mode(mode) {
: ov::op::v15::StringTensorUnpack(inputs[0]), m_mode(mode) {
constructor_validate_and_infer_types();
}

void validate_and_infer_types() override;
void validate_and_infer_types() override {
        OPENVINO_ASSERT(m_mode == "begins_ends", "StringTensorUnpack supports only 'begins_ends' mode, but got ", m_mode);
ov::op::v15::StringTensorUnpack::validate_and_infer_types();
}

std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
auto result = std::make_shared<StringTensorUnpack>(inputs, m_mode);
Expand All @@ -33,12 +37,6 @@ class StringTensorUnpack : public ov::op::Op {
return true;
}

bool has_evaluate() const override {
return true;
}

bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;

private:

std::string m_mode = "begins_ends";
Expand Down
8 changes: 4 additions & 4 deletions tests/layer_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ def create_normalization_model(layer: Union[NormalizationStep, DecodingStep]) ->
input_node = op.Parameter(Type.string, PartialShape(["?"]))
input_node.set_friendly_name("string_input")

output = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
output = _get_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
output = layer.get_ov_subgraph(output)
output = _get_factory().create("StringTensorPack", output).outputs()
output = _get_factory("opset15").create("StringTensorPack", output).outputs()
normalizer = Model(output, [input_node], "normalizer")

return core.compile_model(normalizer)
Expand Down Expand Up @@ -179,10 +179,10 @@ def create_splitting_model(layer: PreTokenizatinStep) -> ov.CompiledModel:
input_node = op.Parameter(Type.string, PartialShape(["?"]))
input_node.set_friendly_name("string_input")

output = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
output = _get_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
output = TokenizerPipeline.add_ragged_dimension(output)
output = layer.get_ov_subgraph(output)
output = _get_factory().create("StringTensorPack", output[2:5]).outputs()
output = _get_factory("opset15").create("StringTensorPack", output[2:5]).outputs()
splitter = Model(output, [input_node], "splitter")

return core.compile_model(splitter)
Expand Down

0 comments on commit 49223d4

Please sign in to comment.