Skip to content

Commit

Permalink
Merge branch 'master' into use-continuos-batching-by-default
Browse files Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov authored Dec 28, 2024
2 parents 17d4333 + 6c56a7b commit ca58d3a
Show file tree
Hide file tree
Showing 50 changed files with 1,408 additions and 1,031 deletions.
29 changes: 20 additions & 9 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,20 @@
- 'src/python/py_tokenizer.cpp'
- 'thirdparty/openvino_tokenizers'
- 'tests/python_tests/tokenizer_configs.py'
- 'tests/python_tests/test_tokenizer.py'

'category: LLM':
- 'src/cpp/include/openvino/genai/llm_pipeline.hpp'
- 'src/cpp/src/llm_pipeline.cpp'
- 'src/cpp/src/lm_encoding.hpp'
- 'src/cpp/src/lm_encoding.cpp'
- 'src/cpp/src/llm_pipeline_base.hpp'
- 'src/cpp/src/llm_pipeline_static.hpp'
- 'src/cpp/src/llm_pipeline_static.cpp'
- 'src/cpp/src/text_callback_streamer.cpp'
- 'src/cpp/src/text_callback_streamer.hpp'
- 'src/python/py_llm_pipeline.cpp'
- 'tests/python_tests/test_generate_api.py'
- 'tests/python_tests/test_chat_generate_api.py'
- 'tests/python_tests/test_llm_pipeline.py'

'category: sampling':
- 'src/cpp/include/openvino/genai/generation_config.hpp'
Expand All @@ -35,6 +38,7 @@
- 'tests/cpp/logit_filtering.cpp'
- 'tests/cpp/generate_config.cpp'
- 'tests/cpp/sampler.cpp'
- 'tests/python_tests/test_sampling.py'

'category: LoRA':
- 'src/cpp/include/openvino/genai/lora_adapter.hpp'
Expand All @@ -54,9 +58,12 @@
- 'src/cpp/include/openvino/genai/whisper_pipeline.hpp'
- 'src/cpp/src/whisper/**/*'
- 'src/cpp/src/whisper_generation_config.cpp'
- 'src/cpp/src/whisper_pipeline_base.hpp'
- 'src/cpp/src/whisper_pipeline.cpp'
- 'src/cpp/src/whisper_pipeline_static.cpp'
- 'src/cpp/src/whisper_pipeline_static.hpp'
- 'src/python/py_whisper_pipeline.cpp'
- 'tests/python_tests/test_whisper_generate_api.py'
- 'tests/python_tests/test_whisper_pipeline.py'

'category: Python API':
- 'src/python/**/*'
Expand All @@ -65,10 +72,14 @@
- 'src/include/openvino/genai/visual_language/**/*'
- 'src/cpp/src/visual_language/**/*'
- 'src/python/py_vlm_pipeline.cpp'
- 'tests/python_tests/test_vlm_api.py'
- 'tests/python_tests/test_vlm_pipeline.py'

'category: speculative decoding':
- 'src/cpp/src/speculative_decoding/**/*'
- 'tests/cpp/speculative_decoding.cpp'

'category: prompt lookup':
- 'src/cpp/src/prompt_lookup/**/*'

'category: continuous batching':
- 'src/cpp/include/openvino/genai/cache_eviction.hpp'
Expand All @@ -91,19 +102,19 @@
- 'src/cpp/src/generation_handle.cpp'
- 'src/cpp/src/generation_stream.hpp'
- 'src/cpp/src/model_runner.hpp'
- 'src/cpp/src/paged_attention_transformations.cpp'
- 'src/cpp/src/paged_attention_transformations.hpp'
- 'src/cpp/src/utils/paged_attention_transformations.cpp'
- 'src/cpp/src/utils/paged_attention_transformations.hpp'
- 'src/cpp/src/scheduler.hpp'
- 'src/cpp/src/sequence_group.cpp'
- 'src/cpp/src/sequence_group.hpp'
- 'src/cpp/src/timer.hpp'
- 'src/python/py_continuous_batching_pipeline.cpp'
- 'tests/python_tests/test_cache_optimizations.py'
- 'tests/python_tests/test_preemption.py'
- 'tests/python_tests/test_sampling.py'
- 'tests/python_tests/test_continuous_batching.py'
- 'tests/python_tests/test_kv_cache_eviction.py'
- 'tests/cpp/block_allocator.cpp'
- 'tests/cpp/block_hash_store.cpp'
- 'tests/cpp/block_manager.cpp'
- 'tests/cpp/cache_eviction.cpp'
- 'tests/cpp/cache_manager.cpp'
- 'tests/cpp/device_config.cpp'
- 'tests/cpp/scheduler.cpp'
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,9 @@ jobs:
matrix:
test:
- name: 'Whisper'
cmd: 'tests/python_tests/test_whisper_generate_api.py'
cmd: 'tests/python_tests/test_whisper_pipeline.py'
- name: 'LLM & VLM'
cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py'
cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_pipeline.py'
defaults:
run:
shell: bash
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ jobs:
if: |
always() &&
(needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success')
timeout-minutes: 90
timeout-minutes: 120
defaults:
run:
shell: bash
Expand Down Expand Up @@ -235,7 +235,7 @@ jobs:
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
python -c "from openvino_genai import LLMPipeline"
python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template"
genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
Expand Down Expand Up @@ -290,7 +290,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke
env:
PYTHONPATH: "./build/:$PYTHONPATH"

Expand All @@ -300,7 +300,7 @@ jobs:
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
python -c "from openvino_genai import LLMPipeline"
python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
genai_package:
name: OpenVINO genai extension (install to OpenVINO package)
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ jobs:
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template"
genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
Expand Down Expand Up @@ -300,7 +300,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

Expand All @@ -309,7 +309,7 @@ jobs:
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
genai_python_lib_vlm:
name: OpenVINO genai VLM tests (cmake + wheel)
Expand Down Expand Up @@ -365,7 +365,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_vlm_api.py
python -m pytest -v ./tests/python_tests/test_vlm_pipeline.py
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ options = {"BUILD_TOKENIZERS" = "OFF"}

[build-system]
requires = [
"py-build-cmake==0.3.3",
"py-build-cmake==0.3.4",
"openvino~=2025.0.0.0.dev",
"pybind11-stubgen==2.5.1",
"cmake~=3.23.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try {
config.max_new_tokens = 20;
config.num_beam_groups = 3;
config.num_beams = 15;
config.diversity_penalty = 1.0f;
config.num_return_sequences = config.num_beams;

// Since the streamer is set, the results will
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def main():
config.max_new_tokens = 20
config.num_beam_groups = 3
config.num_beams = 15
config.diversity_penalty = 1
config.num_return_sequences = config.num_beams

beams = pipe.generate(args.prompts, config)
Expand Down
28 changes: 18 additions & 10 deletions src/cpp/include/openvino/genai/generation_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,22 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
bool echo = false;
size_t logprobs = 0;

// EOS special token
int64_t eos_token_id = -1;
std::set<std::string> stop_strings;
// Default setting in vLLM (and OpenAI API) is not to include stop string in the output
bool include_stop_str_in_output = false;
std::set<int64_t> stop_token_ids;

// penalties (not used in beam search)
float repetition_penalty = 1.0f;
float presence_penalty = 0.0;
float frequency_penalty = 0.0f;

// Beam search specific
size_t num_beam_groups = 1;
size_t num_beams = 1;
float diversity_penalty = 1.0f;
float diversity_penalty = 0.0f;
float length_penalty = 1.0f;
size_t num_return_sequences = 1;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
Expand All @@ -112,19 +119,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
float top_p = 1.0f;
size_t top_k = std::numeric_limits<size_t>::max();
bool do_sample = false;
float repetition_penalty = 1.0f;
float presence_penalty = 0.0;
float frequency_penalty = 0.0f;
size_t rng_seed = 0;

// Assisting generation parameters
float assistant_confidence_threshold = 0.f;
size_t num_assistant_tokens = 0;
size_t max_ngram_size = 0;

// EOS special token
int64_t eos_token_id = -1;

std::optional<AdapterConfig> adapters;

/** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
Expand All @@ -136,11 +137,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multinomial() const;
OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release")
bool is_speculative_decoding() const;
bool is_assisting_generation() const;
bool is_prompt_lookup() const;
void update_generation_config(const ov::AnyMap& config_map);

OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release")
bool is_speculative_decoding() const;

void update_generation_config(const ov::AnyMap& properties);

template <typename... Properties>
util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... properties) {
Expand Down Expand Up @@ -187,8 +190,13 @@ static constexpr ov::Property<float> assistant_confidence_threshold{"assistant_c
static constexpr ov::Property<size_t> num_assistant_tokens{"num_assistant_tokens"};

// Predefined Configs

OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
OPENVINO_GENAI_EXPORTS GenerationConfig beam_search();
OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
OPENVINO_GENAI_EXPORTS GenerationConfig greedy();
OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
OPENVINO_GENAI_EXPORTS GenerationConfig multinomial();

} // namespace genai
} // namespace ov
41 changes: 20 additions & 21 deletions src/cpp/src/cache_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ class CacheManager {
}
OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size());
m_num_allocated_kv_blocks = num_kv_blocks;
ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks);
ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks);

const std::string device_name = m_device_config.get_device();

Expand All @@ -56,6 +54,8 @@ class CacheManager {

if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches
for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape);
ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape);

Expand Down Expand Up @@ -104,6 +104,8 @@ class CacheManager {
} else {
auto remote_context = m_core.get_default_context(device_name);
for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
key_cache_shape);
ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
Expand Down Expand Up @@ -142,30 +144,27 @@ class CacheManager {
}

void copy_blocks(const std::map<size_t, std::list<size_t>>& block_copy_map) {
ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks);
ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks);

ov::Coordinate key_src_start_roi(key_shape.size(), 0);
ov::Coordinate key_src_end_roi = key_shape;
ov::Coordinate key_dst_start_roi(key_shape.size(), 0);
ov::Coordinate key_dst_end_roi = key_shape;

ov::Coordinate value_src_start_roi(value_shape.size(), 0);
ov::Coordinate value_src_end_roi = value_shape;
ov::Coordinate value_dst_start_roi(value_shape.size(), 0);
ov::Coordinate value_dst_end_roi = value_shape;

for (const auto & blocks_pair : block_copy_map) {
size_t src_block_id = blocks_pair.first;
key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1;
value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1;

const std::list<size_t>& dst_block_ids = blocks_pair.second;
for (size_t dst_block_id : dst_block_ids) {
key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;

for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
ov::Coordinate key_src_start_roi(key_shape.size(), 0);
ov::Coordinate key_src_end_roi = key_shape;
ov::Coordinate key_dst_start_roi(key_shape.size(), 0);
ov::Coordinate key_dst_end_roi = key_shape;

ov::Coordinate value_src_start_roi(value_shape.size(), 0);
ov::Coordinate value_src_end_roi = value_shape;
ov::Coordinate value_dst_start_roi(value_shape.size(), 0);
ov::Coordinate value_dst_end_roi = value_shape;
key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1;
value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1;
key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;

ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi);
ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi);

Expand Down
Loading

0 comments on commit ca58d3a

Please sign in to comment.