Moved tokenizers tests to a dedicated file #1436

Merged
merged 4 commits on Dec 25, 2024
2 changes: 1 addition & 1 deletion .github/workflows/mac.yml
@@ -225,7 +225,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
env:
PYTHONPATH: "./build/:$PYTHONPATH"

2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
@@ -236,7 +236,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

14 changes: 8 additions & 6 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -45,6 +45,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
* Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
*
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
* @param presence_penalty reduces absolute log prob if the token was generated at least once.
* @param frequency_penalty reduces absolute log prob as many times as the token was generated.
*
* Beam search specific parameters:
* @param num_beams number of beams for beam search. 1 disables beam search.
* @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -61,15 +65,13 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
* "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
*
* Random sampling parameters:
* Random (or multinomial) sampling parameters:
* @param do_sample whether or not to use multinomial random sampling.
* @param temperature the value used to modulate token probabilities for random sampling.
* @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
* @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
* @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
* @param presence_penalty reduces absolute log prob if the token was generated at least once.
* @param frequency_penalty reduces absolute log prob as many times as the token was generated.
* @param rng_seed initializes random generator.
* @param num_return_sequences the number of sequences to generate from a single prompt.
*
* Assisting generation parameters:
* @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update.
@@ -90,7 +92,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
size_t min_new_tokens = 0;
bool echo = false;
size_t logprobs = 0;

std::set<std::string> stop_strings;
// Default setting in vLLM (and OpenAI API) is not to include stop string in the output
bool include_stop_str_in_output = false;
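
As context for the regrouped doc comment above, a minimal C++ sketch of how these GenerationConfig fields are typically filled in follows. It assumes the usual ov::genai::LLMPipeline entry point and a placeholder model directory; the field names come from the parameter list above, and the values are illustrative only.

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder path; any directory with an exported OpenVINO GenAI model would do.
    ov::genai::LLMPipeline pipe("path/to/model_dir", "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;

    // Penalties are documented in the common block above and apply to all decoding modes.
    config.repetition_penalty = 1.1f;
    config.presence_penalty = 0.5f;
    config.frequency_penalty = 0.5f;

    // Random (multinomial) sampling parameters.
    config.do_sample = true;
    config.temperature = 0.7f;
    config.top_p = 0.9f;
    config.top_k = 50;
    config.rng_seed = 42;

    auto result = pipe.generate("1+1=", config);
    return 0;
}
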
30 changes: 15 additions & 15 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -36,9 +36,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
*
* This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
* When this constructor is used bos, eos, pad token ids are expected to be in IR.
*
* This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
* When this constructor is used bos, eos, pad token ids are expected to be in IR.
* If an IR is older (< 2024.3) then these tokens are default initialized to be ignored.
* @param tokenizer_model_str tokenizer model string
* @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
@@ -55,9 +55,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
);

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
*
* This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
*
* This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's
* tokenizer or detokenizer is defined from model input signature. When this constructor is used bos, eos, pad token ids
* are expected to be in IR. If an IR is older (< 2024.3) then these tokens are default initialized to be ignored.
* @param model_str model string
@@ -82,7 +82,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
ov::Tensor& detokenizer_weights_tensor,
Properties&&... properties
) : Tokenizer(tokenizer_model_str, tokenizer_weights_tensor, detokenizer_model_str, detokenizer_weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }

/**
* @brief ov::genai::Tokenizer constructor with variable number of properties
* @param model_str model string
@@ -93,7 +93,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor,
Properties&&... properties)
: Tokenizer(model_str, weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }

/**
* @brief ov::genai::Tokenizer constructor with variable number of properties
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
@@ -111,7 +111,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @return pair of [input_ids, attention_mask]
*/
TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {});

/**
* @brief encode batch of prompts. Left padding will be applied by default
* @param prompts vector storing batch of prompts
@@ -127,7 +127,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @param prompt std::string with input prompt
* @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
* @return pair of [input_ids, attention_mask]
*/
*/
template <typename... Properties>
util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::string& prompt, Properties&&... properties) {
return encode(prompt, AnyMap{std::forward<Properties>(properties)...});
@@ -164,7 +164,7 @@
}

/**
* @brief decode tokens.
* @brief decode tokens.
* @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
* @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
* @return vector of std::string, with size = batch_size
@@ -183,7 +183,7 @@ }
}

/**
* @brief batched decoding of tokens.
* @brief batched decoding of tokens.
* @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
* @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
* @return vector of std::string, with size equal to batch_size
@@ -203,8 +203,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief Embeds input prompts with special tags for a chat scenario.
*
* For example, for Qwen family models, the prompt "1+1=" would be transformed into
*
* For example, for Qwen family models, the prompt "1+1=" would be transformed into
* <|im_start|>user\n1+1=<|im_end|>\n<|im_start|>assistant\n.
*
* @param history A vector of maps, with chat history, e.g. [{"role": "user", "content": "prompt"}, ...].
@@ -214,7 +214,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @throws Exception if the chat template was unable to parse the input history.
*/
std::string apply_chat_template(ChatHistory history,
bool add_generation_prompt,
bool add_generation_prompt,
const std::string& chat_template = {}) const;

/// @brief Override a chat_template read from tokenizer_config.json.
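
Putting the cleaned-up comments together, here is a short caller-side sketch of the Tokenizer API. The tokenizer directory path is a placeholder and the wrapper function exists only for illustration; the add_special_tokens property and the skip_special_tokens AnyMap key are the ones referenced in the doc comments above.

#include <string>
#include <vector>

#include "openvino/genai/tokenizer.hpp"

void tokenizer_demo() {  // illustrative wrapper only
    // Directory containing openvino_tokenizer.xml and openvino_detokenizer.xml (placeholder path).
    ov::genai::Tokenizer tokenizer("path/to/tokenizer_dir");

    // Encode a single prompt without adding special tokens.
    std::string prompt = "1+1=";
    ov::genai::TokenizedInputs inputs =
        tokenizer.encode(prompt, ov::genai::add_special_tokens(false));

    // Batched decode of the [batch_size, seq_len] tensor; keep special tokens in the output text.
    std::vector<std::string> texts =
        tokenizer.decode(inputs.input_ids, {{"skip_special_tokens", false}});
}
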
3 changes: 3 additions & 0 deletions src/cpp/src/generation_config.cpp
@@ -185,6 +185,9 @@ void GenerationConfig::validate() const {
"Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined.");
if (is_beam_search()) {
OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive");
if (num_beam_groups > 1) {
OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, otherwise it falls back to non-grouped beam search");
}
} else {
OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]");
OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]");
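
The added check couples grouped beam search (num_beam_groups > 1) to a non-zero diversity_penalty. A small sketch of a config that passes the updated validate(); the helper name and values are illustrative, not part of the change:

#include "openvino/genai/generation_config.hpp"

ov::genai::GenerationConfig make_grouped_beam_config() {  // hypothetical helper
    ov::genai::GenerationConfig cfg;
    cfg.max_new_tokens = 32;
    cfg.num_beams = 4;
    cfg.num_beam_groups = 2;        // > 1 selects grouped beam search
    cfg.diversity_penalty = 1.0f;   // must be non-zero now, otherwise validate() throws
    cfg.validate();
    return cfg;
}
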
48 changes: 25 additions & 23 deletions src/cpp/src/tokenizer.cpp
@@ -89,15 +89,16 @@ class Tokenizer::TokenizerImpl {
public:
ov::CompiledModel m_tokenizer;
ov::CompiledModel m_detokenizer;

std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_tokenizer;
std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
// To change the adding special tokens mode we use a statefull subgraph,

// To change the adding special tokens mode we use a statefull subgraph,
// this flag holds the current state value of the CompiledModel.
bool m_add_special_tokens = true;
bool m_skip_special_tokens = true;
bool m_older_than_24_5 = false;

int64_t m_pad_token_id = -1;
int64_t m_bos_token_id = -1;
int64_t m_eos_token_id = -1;
@@ -111,6 +112,7 @@ class Tokenizer::TokenizerImpl {
void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
bool add_special_tokens_flag = m_add_special_tokens;
bool skip_special_tokens_flag = m_skip_special_tokens;

ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);

@@ -126,11 +128,11 @@
// state but the effect is incorrect.
return;
}

// add_special_tokens is managed by Select op with a bool input.
ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
*add_special_tensor.data<bool>() = add_special_tokens_flag;

// skip_special_tokens is managed by multiplication with a number, therefore i32.
ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1});
*skip_special_tensor.data<int>() = skip_special_tokens_flag;
@@ -148,32 +150,32 @@

TokenizerImpl() = default;

TokenizerImpl(const std::filesystem::path& models_papth, const ov::AnyMap& properties) {
setupTokenizer(models_papth, properties);
TokenizerImpl(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
setup_tokenizer(models_path, properties);
}

TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
setupTokenizer(models, properties);
setup_tokenizer(models, properties);
}

void setupTokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
ScopedVar env_manager(tokenizers_relative_to_genai().string());
auto core = get_core_singleton();

OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_papth' parameter should be a path to a dir not a xml file");
OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_path' parameter should be a path to a dir not a xml file");

std::shared_ptr<ov::Model> ov_tokenizer = nullptr;
std::shared_ptr<ov::Model> ov_detokenizer = nullptr;

if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) {
ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml");
}

if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) {
ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml");
}

setupTokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);

// If special tokens were not found from IR, try to read them from config.
// This will be triggered only for IRs older than 2024.3.
@@ -184,21 +186,20 @@ class Tokenizer::TokenizerImpl {
// Try to read tokenizer_config if some token ids or token str are not defined.
read_tokenizer_config_if_necessary(models_path);
}

// If chat_template was not found in IR, try to read them from config.
if (m_chat_template.empty()) {
m_chat_template = chat_template_from_tokenizer_json_if_exists(models_path);
}
}


void setupTokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
auto [ov_tokenizer, ov_detokenizer] = models;
OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided");

auto core = get_core_singleton();
std::string device = "CPU"; // only CPU is supported for now

std::string version_str;
utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
// Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
@@ -231,7 +232,7 @@ class Tokenizer::TokenizerImpl {
return std::move(this->m_detokenizer.create_infer_request());
});
}

// Initialize tokenizer's cache to save time later.
if (m_tokenizer) {
// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
@@ -286,10 +287,11 @@ class Tokenizer::TokenizerImpl {

nlohmann::json data = nlohmann::json::parse(f);

using ov::genai::utils::read_json_param;
// they are in the format {"bos_token": { "content": "<s>",... }}
auto read_token_content_str = [&data](std::string key_name, std::string& val) {
if (val == "" && data.contains(key_name)) { read_json_param(data[key_name], "content", val); }
auto read_token_content_str = [&data](const std::string& key_name, std::string& val) {
if (val.empty() && data.contains(key_name)) {
utils::read_json_param(data[key_name], "content", val);
}
};
read_token_content_str(pad_token_key_name, m_pad_token);
read_token_content_str(bos_token_key_name, m_bos_token);
@@ -494,7 +496,7 @@ class Tokenizer::TokenizerImpl {
{"is none", "is undefined"},
{"= none", "= undefined"},
// Jinja2Cpp does not support Python-style slicing, e.g. [1:].
// If chat template contains such slicing, we replace it with
// If chat template contains such slicing, we replace it with
// a placeholder at the moment.
{"messages[1:]", "slice(messages, 1)"},
};
@@ -537,7 +539,7 @@ class Tokenizer::TokenizerImpl {
env.GetSettings().trimBlocks = true;
jinja2::Template tpl(&env);
tpl.Load(chat_tpl);

jinja2::UserCallable slice_callable = jinja2::MakeCallable(
[](const jinja2::GenericList& messages, const size_t& start) {
jinja2::ValuesList result;
@@ -607,7 +609,7 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c
ScopedVar env_manager(tokenizers_relative_to_genai().string());
auto core = get_core_singleton();
auto model = core.read_model(model_str, weights_tensor);

auto parameters = model->get_parameters();
OPENVINO_ASSERT(!parameters.empty());
if (parameters.front()->get_element_type() == ov::element::string) {
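
On top of the template-patching logic shown above (the "is none" replacements and the slice callable for messages[1:]), the caller-facing entry point is apply_chat_template. A sketch of its use, assuming a Tokenizer constructed as in the earlier sketch; the wrapper function is illustrative, and the Qwen-style output is the example given in the header comment:

#include <string>

#include "openvino/genai/tokenizer.hpp"

std::string build_chat_prompt(ov::genai::Tokenizer& tokenizer) {  // illustrative wrapper only
    // ChatHistory is the vector-of-maps structure described in the header comment.
    ov::genai::ChatHistory history = {
        {{"role", "user"}, {"content", "1+1="}}
    };
    // For Qwen-family templates this is expected to produce:
    // <|im_start|>user\n1+1=<|im_end|>\n<|im_start|>assistant\n
    return tokenizer.apply_chat_template(history, /*add_generation_prompt=*/true);
}
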