Fixes
ilya-lavrenov committed Dec 25, 2024
1 parent 06eddb8 commit ae47253
Showing 5 changed files with 87 additions and 58 deletions.
14 changes: 8 additions & 6 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -45,6 +45,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
* Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
*
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
* @param presence_penalty reduces absolute log prob if the token was generated at least once.
* @param frequency_penalty reduces absolute log prob as many times as the token was generated.
*
* Beam search specific parameters:
* @param num_beams number of beams for beam search. 1 disables beam search.
* @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -61,15 +65,13 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
* "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
*
* Random (or multinomial) sampling parameters:
* @param do_sample whether or not to use multinomial random sampling.
* @param temperature the value used to modulate token probabilities for random sampling.
* @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
* @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
* @param rng_seed initializes random generator.
* @param num_return_sequences the number of sequences to generate from a single prompt.
*
* Assisting generation parameters:
* @param assistant_confidence_threshold the minimum probability a candidate token must have to be validated by the main model when the number of candidates is updated dynamically (dynamic strategy).
@@ -90,7 +92,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
size_t min_new_tokens = 0;
bool echo = false;
size_t logprobs = 0;

std::set<std::string> stop_strings;
// Default setting in vLLM (and OpenAI API) is not to include stop string in the output
bool include_stop_str_in_output = false;
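With this change the penalty parameters sit in the generic group, so they apply to greedy decoding, beam search, and multinomial sampling alike. A minimal Python sketch of setting them; "./model_dir" is a placeholder for an exported OpenVINO model and the values are illustrative, not recommendations:

```python
import openvino_genai

# Minimal sketch, assuming a model exported to "./model_dir" (placeholder path).
pipe = openvino_genai.LLMPipeline("./model_dir", "CPU")

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 64
config.repetition_penalty = 1.2  # > 1.0 discourages repeated tokens; 1.0 means no penalty
config.presence_penalty = 0.5    # flat reduction once a token has appeared at least once
config.frequency_penalty = 0.5   # reduction scales with how often the token has appeared

print(pipe.generate("The weather today is", config))
```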
3 changes: 3 additions & 0 deletions src/cpp/src/generation_config.cpp
@@ -185,6 +185,9 @@ void GenerationConfig::validate() const {
"Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined.");
if (is_beam_search()) {
OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive");
if (num_beam_groups > 1) {
OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, otherwise it falls back to non-grouped beam search");
}
} else {
OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequency_penalty must be in the range [-2; +2]");
OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty must be in the range [-2; +2]");
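The new assertion makes the grouped-beam-search requirement explicit. A sketch of a configuration that passes this validation; the fields are the ones documented in this commit, the values are illustrative:

```python
import openvino_genai

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 32

# Grouped beam search: with num_beam_groups > 1, diversity_penalty must be
# non-zero, otherwise the assertion above fires.
config.num_beams = 4
config.num_beam_groups = 2
config.diversity_penalty = 1.0

# For non-beam decoding, frequency_penalty and presence_penalty are instead
# range-checked to lie within [-2; +2].
```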
60 changes: 36 additions & 24 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -361,10 +361,10 @@ class ContinuousBatchingPipeline:
This class is used for generation with LLMs with continuous batching
"""
@typing.overload
def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
...
@typing.overload
def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
...
@typing.overload
def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle:
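The signature change above means the constructors now accept any os.PathLike, not just str. A short sketch, where the model path is a placeholder and the default-constructed SchedulerConfig is an assumption about reasonable defaults:

```python
from pathlib import Path

import openvino_genai

models_path = Path("./model_dir")  # os.PathLike objects are now accepted directly
scheduler_config = openvino_genai.SchedulerConfig()  # assumed defaults; tune for your workload

pipe = openvino_genai.ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
```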
@@ -522,24 +522,28 @@ class FluxTransformer2DModel:
class GenerationConfig:
"""
Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
be used while greedy and beam search parameters will not affect decoding at all.
Parameters:
max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
eos_token_id: token_id of <eos> (end of sentence)
min_new_tokens: set 0 probability for eos_token_id for the first min_new_tokens generated tokens.
stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
include_stop_str_in_output: if set to true, the matched stop string is included in the generation output (default: false)
stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
echo: if set to true, the model will echo the prompt in the output.
logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
presence_penalty: reduces absolute log prob if the token was generated at least once.
frequency_penalty: reduces absolute log prob as many times as the token was generated.
Beam search specific parameters:
num_beams: number of beams for beam search. 1 disables beam search.
num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -550,8 +554,8 @@ class GenerationConfig:
length_penalty < 0.0 encourages shorter sequences.
num_return_sequences: the number of sequences to return for grouped beam search decoding.
no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
"openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
"openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when it is very unlikely to find better candidates;
"openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
@@ -560,7 +564,7 @@
top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
do_sample: whether or not to use multinomial random sampling.
num_return_sequences: the number of sequences to generate from a single prompt.
"""
adapters: AdapterConfig | None
assistant_confidence_threshold: float
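Pulling the docstring together, a minimal multinomial-sampling configuration might look like the sketch below; every field is taken from the stub above, and all values are illustrative:

```python
import openvino_genai

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100
config.do_sample = True          # selects the random (multinomial) sampling group
config.temperature = 0.8
config.top_p = 0.9
config.top_k = 50
config.rng_seed = 42             # fixed seed makes sampling reproducible
config.num_return_sequences = 3  # number of sequences generated from a single prompt
```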
@@ -951,24 +955,28 @@ class LLMPipeline:
:rtype: DecodedResults, EncodedResults, str
Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
be used while greedy and beam search parameters will not affect decoding at all.
Parameters:
max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
eos_token_id: token_id of <eos> (end of sentence)
min_new_tokens: set 0 probability for eos_token_id for the first min_new_tokens generated tokens.
stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
include_stop_str_in_output: if set to true, the matched stop string is included in the generation output (default: false)
stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
echo: if set to true, the model will echo the prompt in the output.
logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
presence_penalty: reduces absolute log prob if the token was generated at least once.
frequency_penalty: reduces absolute log prob as many times as the token was generated.
Beam search specific parameters:
num_beams: number of beams for beam search. 1 disables beam search.
num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -979,8 +987,8 @@
length_penalty < 0.0 encourages shorter sequences.
num_return_sequences: the number of sequences to return for grouped beam search decoding.
no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
"openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
"openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when it is very unlikely to find better candidates;
"openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
@@ -989,7 +997,7 @@
top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
do_sample: whether or not to use multinomial random sampling.
num_return_sequences: the number of sequences to generate from a single prompt.
"""
@typing.overload
def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None:
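A usage sketch for the generate call documented above. The model path is a placeholder, and the keyword form assumes that extra kwargs map onto GenerationConfig fields:

```python
import openvino_genai

pipe = openvino_genai.LLMPipeline("./model_dir", "CPU")  # placeholder path

# Explicit config object:
config = openvino_genai.GenerationConfig()
config.max_new_tokens = 50
print(pipe.generate("What is OpenVINO?", config))

# Keyword form (assuming kwargs map onto GenerationConfig fields):
print(pipe.generate("What is OpenVINO?", max_new_tokens=50, do_sample=True, top_p=0.9))
```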
@@ -1032,24 +1040,28 @@ class LLMPipeline:
:rtype: DecodedResults, EncodedResults, str
Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
be used while greedy and beam search parameters will not affect decoding at all.
Parameters:
max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
eos_token_id: token_id of <eos> (end of sentence)
min_new_tokens: set 0 probability for eos_token_id for the first min_new_tokens generated tokens.
stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
include_stop_str_in_output: if set to true, the matched stop string is included in the generation output (default: false)
stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
echo: if set to true, the model will echo the prompt in the output.
logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
presence_penalty: reduces absolute log prob if the token was generated at least once.
frequency_penalty: reduces absolute log prob as many times as the token was generated.
Beam search specific parameters:
num_beams: number of beams for beam search. 1 disables beam search.
num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -1060,8 +1072,8 @@
length_penalty < 0.0 encourages shorter sequences.
num_return_sequences: the number of sequences to return for grouped beam search decoding.
no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
"openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
"openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when it is very unlikely to find better candidates;
"openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
@@ -1070,7 +1082,7 @@
top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
do_sample: whether or not to use multinomial random sampling.
num_return_sequences: the number of sequences to generate from a single prompt.
"""
def get_generation_config(self) -> GenerationConfig:
...
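The pipeline-level default config can be read, adjusted, and written back. A short sketch, assuming the matching set_generation_config setter and a placeholder model path:

```python
import openvino_genai

pipe = openvino_genai.LLMPipeline("./model_dir", "CPU")  # placeholder path

config = pipe.get_generation_config()
config.max_new_tokens = 256
config.repetition_penalty = 1.1
pipe.set_generation_config(config)  # subsequent generate() calls pick this up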
@@ -1420,7 +1432,7 @@ class StopCriteria:
"""
StopCriteria controls the stopping condition for grouped beam search.
The following values are possible:
"openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates.
"openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates.