Merge branch 'master' into use-continuos-batching-by-default

openvinotoolkit · Dec 28, 2024 · ca58d3a
2 parents 17d4333 + 6c56a7b
commit ca58d3a
Show file tree

Hide file tree

Showing 50 changed files with 1,408 additions and 1,031 deletions.
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -13,17 +13,20 @@
 - 'src/python/py_tokenizer.cpp'
 - 'thirdparty/openvino_tokenizers'
 - 'tests/python_tests/tokenizer_configs.py'
+- 'tests/python_tests/test_tokenizer.py'
 
 'category: LLM':
 - 'src/cpp/include/openvino/genai/llm_pipeline.hpp'
 - 'src/cpp/src/llm_pipeline.cpp'
+- 'src/cpp/src/lm_encoding.hpp'
 - 'src/cpp/src/lm_encoding.cpp'
 - 'src/cpp/src/llm_pipeline_base.hpp'
 - 'src/cpp/src/llm_pipeline_static.hpp'
 - 'src/cpp/src/llm_pipeline_static.cpp'
+- 'src/cpp/src/text_callback_streamer.cpp'
+- 'src/cpp/src/text_callback_streamer.hpp'
 - 'src/python/py_llm_pipeline.cpp'
-- 'tests/python_tests/test_generate_api.py'
-- 'tests/python_tests/test_chat_generate_api.py'
+- 'tests/python_tests/test_llm_pipeline.py'
 
 'category: sampling':
 - 'src/cpp/include/openvino/genai/generation_config.hpp'
@@ -35,6 +38,7 @@
 - 'tests/cpp/logit_filtering.cpp'
 - 'tests/cpp/generate_config.cpp'
 - 'tests/cpp/sampler.cpp'
+- 'tests/python_tests/test_sampling.py'
 
 'category: LoRA':
 - 'src/cpp/include/openvino/genai/lora_adapter.hpp'
@@ -54,9 +58,12 @@
 - 'src/cpp/include/openvino/genai/whisper_pipeline.hpp'
 - 'src/cpp/src/whisper/**/*'
 - 'src/cpp/src/whisper_generation_config.cpp'
+- 'src/cpp/src/whisper_pipeline_base.hpp'
 - 'src/cpp/src/whisper_pipeline.cpp'
+- 'src/cpp/src/whisper_pipeline_static.cpp'
+- 'src/cpp/src/whisper_pipeline_static.hpp'
 - 'src/python/py_whisper_pipeline.cpp'
-- 'tests/python_tests/test_whisper_generate_api.py'
+- 'tests/python_tests/test_whisper_pipeline.py'
 
 'category: Python API':
 - 'src/python/**/*'
@@ -65,10 +72,14 @@
 - 'src/include/openvino/genai/visual_language/**/*'
 - 'src/cpp/src/visual_language/**/*'
 - 'src/python/py_vlm_pipeline.cpp'
-- 'tests/python_tests/test_vlm_api.py'
+- 'tests/python_tests/test_vlm_pipeline.py'
 
 'category: speculative decoding':
 - 'src/cpp/src/speculative_decoding/**/*'
+- 'tests/cpp/speculative_decoding.cpp'
+
+'category: prompt lookup':
+- 'src/cpp/src/prompt_lookup/**/*'
 
 'category: continuous batching':
 - 'src/cpp/include/openvino/genai/cache_eviction.hpp'
@@ -91,19 +102,19 @@
 - 'src/cpp/src/generation_handle.cpp'
 - 'src/cpp/src/generation_stream.hpp'
 - 'src/cpp/src/model_runner.hpp'
-- 'src/cpp/src/paged_attention_transformations.cpp'
-- 'src/cpp/src/paged_attention_transformations.hpp'
+- 'src/cpp/src/utils/paged_attention_transformations.cpp'
+- 'src/cpp/src/utils/paged_attention_transformations.hpp'
 - 'src/cpp/src/scheduler.hpp'
 - 'src/cpp/src/sequence_group.cpp'
 - 'src/cpp/src/sequence_group.hpp'
 - 'src/cpp/src/timer.hpp'
 - 'src/python/py_continuous_batching_pipeline.cpp'
-- 'tests/python_tests/test_cache_optimizations.py'
-- 'tests/python_tests/test_preemption.py'
-- 'tests/python_tests/test_sampling.py'
+- 'tests/python_tests/test_continuous_batching.py'
+- 'tests/python_tests/test_kv_cache_eviction.py'
 - 'tests/cpp/block_allocator.cpp'
 - 'tests/cpp/block_hash_store.cpp'
 - 'tests/cpp/block_manager.cpp'
+- 'tests/cpp/cache_eviction.cpp'
 - 'tests/cpp/cache_manager.cpp'
 - 'tests/cpp/device_config.cpp'
 - 'tests/cpp/scheduler.cpp'

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -268,9 +268,9 @@ jobs:
       matrix:
         test:
           - name: 'Whisper'
-            cmd: 'tests/python_tests/test_whisper_generate_api.py'
+            cmd: 'tests/python_tests/test_whisper_pipeline.py'
           - name: 'LLM & VLM'
-            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py'
+            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_pipeline.py'
     defaults:
       run:
         shell: bash

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
@@ -178,7 +178,7 @@ jobs:
     if: |
       always() &&
       (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success')
-    timeout-minutes: 90
+    timeout-minutes: 120
     defaults:
       run:
         shell: bash
@@ -235,7 +235,7 @@ jobs:
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
           python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template"
 
   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -290,7 +290,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
 
@@ -300,7 +300,7 @@ jobs:
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
           python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
 
   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
@@ -244,7 +244,7 @@ jobs:
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template"
 
   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -300,7 +300,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 
@@ -309,7 +309,7 @@ jobs:
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
 
   genai_python_lib_vlm:
     name: OpenVINO genai VLM tests (cmake + wheel)
@@ -365,7 +365,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_vlm_api.py
+          python -m pytest -v ./tests/python_tests/test_vlm_pipeline.py
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,7 +51,7 @@ options = {"BUILD_TOKENIZERS" = "OFF"}
 
 [build-system]
 requires = [
-    "py-build-cmake==0.3.3",
+    "py-build-cmake==0.3.4",
     "openvino~=2025.0.0.0.dev",
     "pybind11-stubgen==2.5.1",
     "cmake~=3.23.0"

diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp
@@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try {
     config.max_new_tokens = 20;
     config.num_beam_groups = 3;
     config.num_beams = 15;
+    config.diversity_penalty = 1.0f;
     config.num_return_sequences = config.num_beams;
 
     // Since the streamer is set, the results will

diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
@@ -19,6 +19,7 @@ def main():
     config.max_new_tokens = 20
     config.num_beam_groups = 3
     config.num_beams = 15
+    config.diversity_penalty = 1
     config.num_return_sequences = config.num_beams
 
     beams = pipe.generate(args.prompts, config)

diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -93,15 +93,22 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     bool echo = false;
     size_t logprobs = 0;
 
+    // EOS special token
+    int64_t eos_token_id = -1;
     std::set<std::string> stop_strings;
     // Default setting in vLLM (and OpenAI API) is not to include stop string in the output
     bool include_stop_str_in_output = false;
     std::set<int64_t> stop_token_ids;
 
+    // penalties (not used in beam search)
+    float repetition_penalty = 1.0f;
+    float presence_penalty = 0.0;
+    float frequency_penalty = 0.0f;
+
     // Beam search specific
     size_t num_beam_groups = 1;
     size_t num_beams = 1;
-    float diversity_penalty = 1.0f;
+    float diversity_penalty = 0.0f;
     float length_penalty = 1.0f;
     size_t num_return_sequences = 1;
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
@@ -112,19 +119,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     float top_p = 1.0f;
     size_t top_k = std::numeric_limits<size_t>::max();
     bool do_sample = false;
-    float repetition_penalty = 1.0f;
-    float presence_penalty = 0.0;
-    float frequency_penalty = 0.0f;
     size_t rng_seed = 0;
 
     // Assisting generation parameters
     float assistant_confidence_threshold = 0.f;
     size_t num_assistant_tokens = 0;
     size_t max_ngram_size = 0;
 
-    // EOS special token
-    int64_t eos_token_id = -1;
-
     std::optional<AdapterConfig> adapters;
 
     /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
@@ -136,11 +137,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     bool is_greedy_decoding() const;
     bool is_beam_search() const;
     bool is_multinomial() const;
-    OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release")
-    bool is_speculative_decoding() const;
     bool is_assisting_generation() const;
     bool is_prompt_lookup() const;
-    void update_generation_config(const ov::AnyMap& config_map);
+
+    OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release")
+    bool is_speculative_decoding() const;
+
+    void update_generation_config(const ov::AnyMap& properties);
 
     template <typename... Properties>
     util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... properties) {
@@ -187,8 +190,13 @@ static constexpr ov::Property<float> assistant_confidence_threshold{"assistant_c
 static constexpr ov::Property<size_t> num_assistant_tokens{"num_assistant_tokens"};
 
 // Predefined Configs
+
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
 OPENVINO_GENAI_EXPORTS GenerationConfig beam_search();
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
 OPENVINO_GENAI_EXPORTS GenerationConfig greedy();
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
 OPENVINO_GENAI_EXPORTS GenerationConfig multinomial();
+
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp
@@ -46,8 +46,6 @@ class CacheManager {
         }
         OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size());
         m_num_allocated_kv_blocks = num_kv_blocks;
-        ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks);
-        ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks);
 
         const std::string device_name = m_device_config.get_device();
 
@@ -56,6 +54,8 @@ class CacheManager {
 
         if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
+                ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
+                ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
                 ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape);
                 ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape);
 
@@ -104,6 +104,8 @@ class CacheManager {
         } else {
             auto remote_context = m_core.get_default_context(device_name);
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
+                ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
+                ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
                 ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
                                                                     key_cache_shape);
                 ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
@@ -142,30 +144,27 @@ class CacheManager {
     }
 
     void copy_blocks(const std::map<size_t, std::list<size_t>>& block_copy_map) {
-        ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks);
-        ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks);
-
-        ov::Coordinate key_src_start_roi(key_shape.size(), 0);
-        ov::Coordinate key_src_end_roi = key_shape;
-        ov::Coordinate key_dst_start_roi(key_shape.size(), 0);
-        ov::Coordinate key_dst_end_roi = key_shape;
-
-        ov::Coordinate value_src_start_roi(value_shape.size(), 0);
-        ov::Coordinate value_src_end_roi = value_shape;
-        ov::Coordinate value_dst_start_roi(value_shape.size(), 0);
-        ov::Coordinate value_dst_end_roi = value_shape;
-
         for (const auto & blocks_pair : block_copy_map) {
             size_t src_block_id = blocks_pair.first;
-            key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1;
-            value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1;
-
             const std::list<size_t>& dst_block_ids = blocks_pair.second;
             for (size_t dst_block_id : dst_block_ids) {
-                key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
-                value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;
-
                 for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
+                    ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
+                    ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
+                    ov::Coordinate key_src_start_roi(key_shape.size(), 0);
+                    ov::Coordinate key_src_end_roi = key_shape;
+                    ov::Coordinate key_dst_start_roi(key_shape.size(), 0);
+                    ov::Coordinate key_dst_end_roi = key_shape;
+
+                    ov::Coordinate value_src_start_roi(value_shape.size(), 0);
+                    ov::Coordinate value_src_end_roi = value_shape;
+                    ov::Coordinate value_dst_start_roi(value_shape.size(), 0);
+                    ov::Coordinate value_dst_end_roi = value_shape;
+                    key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1;
+                    value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1;
+                    key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
+                    value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;
+
                     ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi);
                     ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi);