Merge branch 'master' into xufang/speculative_decoding_profile

openvinotoolkit · Dec 26, 2024 · d6e77d3 · d6e77d3
2 parents 9c441ec + 812163a
commit d6e77d3
Show file tree

Hide file tree

Showing 96 changed files with 3,514 additions and 2,504 deletions.
diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
@@ -16,10 +16,10 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241205_x86_64.tgz
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
-  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241205_x86_64.tgz
-  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip
+  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241224_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241224_x86_64.tgz
+  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
 jobs:
   cpp-multinomial-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores

diff --git a/.github/workflows/job_vlm_sample_llava.yml b/.github/workflows/job_vlm_sample_llava.yml
@@ -11,7 +11,7 @@ on:
         type: string
 
 env:
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
 
 jobs:
   visual_language_chat_sample-ubuntu-llava:

diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -18,8 +18,8 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.9'
-  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
-  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip
+  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
   OV_INSTALL_DIR: ${{ github.workspace }}/ov
 
 jobs:

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -270,7 +270,7 @@ jobs:
           - name: 'Whisper'
             cmd: 'tests/python_tests/test_whisper_generate_api.py'
           - name: 'LLM & VLM'
-            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py -k "not Qwen2-0.5B-Instruct"' # Skip failed tests Qwen2-0.5B-Instruct
+            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py'
     defaults:
       run:
         shell: bash

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
@@ -114,14 +114,14 @@ jobs:
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
         run: |
           huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
         run: |
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
         run: |
           wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4
           rm -rf ./ov_models/lcm_dreamshaper_v7/
       - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
         run: |

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.9'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: master
   OV_TARBALL: ''
 
 jobs:
@@ -225,7 +225,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
 

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: master
   OV_TARBALL: ''
 
 jobs:
@@ -236,7 +236,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 

diff --git a/README.md b/README.md
@@ -331,10 +331,14 @@ For more examples check out our [Generative AI workflow](https://docs.openvino.a
 
 NOTE: Whisper Pipeline requires preprocessing of audio input (to adjust sampling rate and normalize)
  
- ### Converting and compressing image generation model from Hugging Face library
+ ### Converting and quantizing speech-to-text model from Hugging Face library
 ```sh
 #Download and convert to OpenVINO whisper-base model
 optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base
+
+#Download, convert and apply int8 static quantization to whisper-base model
+optimum-cli export openvino --trust-remote-code --model openai/whisper-base \
+--quant-mode int8 --dataset librispeech --num-samples 32 whisper-base-int8
 ```
 
 ### Run generation using Whisper Pipeline API in Python

diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
@@ -22,14 +22,10 @@ int main(int argc, char* argv[]) try {
 
     std::string device = "CPU";
 
-    ov::genai::SchedulerConfig scheduler_config;
-    scheduler_config.cache_size = 5;
-
     ov::genai::LLMPipeline pipe(
         model_path,
         device,
-        ov::genai::prompt_lookup(true),
-        ov::genai::scheduler_config(scheduler_config));
+        ov::genai::prompt_lookup(true));
 
     auto streamer = [](std::string subword) {
         std::cout << subword << std::flush;

diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
@@ -26,14 +26,10 @@ int main(int argc, char* argv[]) try {
     // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft.
     std::string main_device = "CPU", draft_device = "CPU";
 
-    ov::genai::SchedulerConfig scheduler_config;
-    scheduler_config.cache_size = 5;
-
     ov::genai::LLMPipeline pipe(
         main_model_path,
         main_device,
-        ov::genai::draft_model(draft_model_path, draft_device),
-        ov::genai::scheduler_config(scheduler_config));
+        ov::genai::draft_model(draft_model_path, draft_device));
 
     auto streamer = [](std::string subword) {
         std::cout << subword << std::flush;

diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md
@@ -33,6 +33,91 @@ timestamps: [0, 2] text:  How are you doing today?
 
 See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
+# Whisper pipeline usage
+
+```c++
+#include "openvino/genai/whisper_pipeline.hpp"
+
+ov::genai::WhisperPipeline pipeline(model_dir, "CPU");
+// Pipeline expects normalized audio with Sample Rate of 16kHz
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech);
+//  How are you doing today?
+```
+
+### Transcription
+
+Whisper pipeline predicts the language of the source audio automatically.
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech);
+//  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav");
+result = pipeline.generate(raw_speech);
+//  Il s'agit d'une entité très complexe qui consiste...
+```
+
+If the source audio languange is know in advance, it can be specified as an argument to `generate` method:
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::language("<|en|>"));
+//  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav");
+result = pipeline.generate(raw_speech, ov::genai::language("<|fr|>"));
+//  Il s'agit d'une entité très complexe qui consiste...
+```
+
+### Translation
+
+By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("fr_sample.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::task("translate"));
+//  It is a very complex entity that consists...
+```
+
+### Timestamps prediction
+
+The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:
+
+```C++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::return_timestamps(true));
+
+std::cout << std::setprecision(2);
+for (auto& chunk : *result.chunks) {
+    std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
+}
+// timestamps: [0, 2] text:  How are you doing today?
+```
+
+### Long-Form audio Transcription
+
+The Whisper model is designed to work on audio samples of up to 30s in duration. Whisper pipeline uses sequential chunking algorithm to transcribe audio samples of arbitrary length.
+Sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
+
+### Initial prompt and hotwords
+
+Whisper pipeline has `initial_prompt` and `hotwords` generate arguments:
+* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window
+* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows
+
+The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:
+
+```c++
+auto result = pipeline.generate(raw_speech);
+//  He has gone and gone for good answered Paul Icrom who...
+
+result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+//  He has gone and gone for good answered Polychrome who...
+```
+
+
 ### Troubleshooting
 
 #### Empty or rubbish output

diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
@@ -28,6 +28,7 @@ int main(int argc, char* argv[]) try {
 
     std::cout << result << "\n";
 
+    std::cout << std::setprecision(2);
     for (auto& chunk : *result.chunks) {
         std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
     }

diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
@@ -18,11 +18,8 @@ def main():
     args = parser.parse_args()
 
     device = 'CPU'
-    scheduler_config = openvino_genai.SchedulerConfig()
-    # cache params
-    scheduler_config.cache_size = 2
 
-    pipe = openvino_genai.LLMPipeline(args.model_dir, device, scheduler_config=scheduler_config, prompt_lookup=True)
+    pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True)
 
     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100

diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
@@ -25,13 +25,9 @@ def main():
     main_device = 'CPU'  # GPU can be used as well
     draft_device = 'CPU'
 
-    scheduler_config = openvino_genai.SchedulerConfig()
-    # cache params
-    scheduler_config.cache_size = 2
-
     draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)
 
-    pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model)
+    pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)
 
     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100

diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md
@@ -40,6 +40,93 @@ timestamps: [0, 2] text:  How are you doing today?
 
 See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
+# Whisper pipeline usage
+
+```python
+import openvino_genai
+import librosa
+
+def read_wav(filepath):
+    raw_speech, samplerate = librosa.load(filepath, sr=16000)
+    return raw_speech.tolist()
+
+pipe = openvino_genai.WhisperPipeline(model_dir, "CPU")
+# Pipeline expects normalized audio with Sample Rate of 16kHz
+raw_speech = read_wav('how_are_you_doing_today.wav')
+result = pipe.generate(raw_speech)
+#  How are you doing today?
+```
+
+### Transcription
+
+Whisper pipeline predicts the language of the source audio automatically.
+
+```python
+raw_speech = read_wav('how_are_you_doing_today.wav')
+result = pipe.generate(raw_speech)
+#  How are you doing today?
+
+raw_speech = read_wav('fr_sample.wav')
+result = pipe.generate(raw_speech)
+#  Il s'agit d'une entité très complexe qui consiste...
+```
+
+If the source audio languange is know in advance, it can be specified as an argument to `generate` method:
+
+```python
+raw_speech = read_wav("how_are_you_doing_today.wav")
+result = pipe.generate(raw_speech, language="<|en|>")
+#  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav")
+result = pipe.generate(raw_speech, language="<|fr|>")
+#  Il s'agit d'une entité très complexe qui consiste...
+```
+
+### Translation
+
+By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":
+
+```python
+raw_speech = read_wav("fr_sample.wav")
+result = pipe.generate(raw_speech, task="translate")
+# It is a very complex entity that consists...
+```
+
+### Timestamps prediction
+
+The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:
+
+```python
+raw_speech = read_wav("how_are_you_doing_today.wav")
+result = pipe.generate(raw_speech, return_timestamps=True)
+
+for chunk in result.chunks:
+    print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
+# timestamps: [0.00, 2.00] text:  How are you doing today?
+```
+
+### Long-Form audio Transcription
+
+The Whisper model is designed to work on audio samples of up to 30s in duration. Whisper pipeline uses sequential chunking algorithm to transcribe audio samples of arbitrary length.
+Sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
+
+### Initial prompt and hotwords
+
+Whisper pipeline has `initial_prompt` and `hotwords` generate arguments:
+* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window
+* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows
+
+The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:
+
+```python
+result = pipe.generate(raw_speech)
+#  He has gone and gone for good answered Paul Icrom who...
+
+result = pipe.generate(raw_speech, initial_prompt="Polychrome")
+#  He has gone and gone for good answered Polychrome who...
+```
+
 ### Troubleshooting
 
 #### Empty or rubbish output

diff --git a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py
@@ -18,7 +18,7 @@ def main():
     parser.add_argument("wav_file_path")
     args = parser.parse_args()
 
-    device = "CPU"  # GPU can be used as well
+    device = "CPU"  # GPU, NPU can be used as well
     pipe = openvino_genai.WhisperPipeline(args.model_dir, device)
 
     config = pipe.get_generation_config()
@@ -34,8 +34,9 @@ def main():
 
     print(result)
 
-    for chunk in result.chunks:
-        print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")
+    if result.chunks:
+        for chunk in result.chunks:
+            print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
 
 
 if "__main__" == __name__: