diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index a416ca51233893..8a1dba408fbd11 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"
  */
 static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};
 
+/**
+ * @brief
+ * Type: ov::AnyMap.
+ * Tell NPUW the configuration for compilation of the prefill model.
+ * NOTE: !! Write-only !!
+ */
+static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
+
 /**
  * @brief
  * Type: std::string.
@@ -421,6 +429,13 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_
  */
 static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
 
+/**
+ * @brief
+ * Type: ov::AnyMap.
+ * Tell NPUW the configuration for compilation of the generate model.
+ * NOTE: !! Write-only !!
+ */
+static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
 }  // namespace llm
 
 }  // namespace npuw
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index eb13bc8b5bd1d9..496fd8bd8fd8d7 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -321,6 +321,15 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::I
     return std::make_optional(NPUDesc{arch.as<std::string>(), max_tiles.as<int64_t>()});
 }
 
+std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
+    if (auto it = config.find(option_name); it != config.end()) {
+        std::optional<ov::Any> found = std::make_optional(it->second);
+        config.erase(it);
+        return found;
+    }
+    return std::nullopt;
+}
+
 ov::AnyMap get_baseline_common_config() {
     ov::AnyMap config = {
         {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -418,6 +427,13 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     std::map<std::string, ov::Any> npuw_llm_props;
     std::map<std::string, ov::Any> other_props;
     split_llm_properties(properties, npuw_llm_props, other_props);
+
+    // Remove "NPUW_LLM_PREFILL_CONFIG" and "NPUW_LLM_GENERATE_CONFIG" from the map,
+    // so they are not passed into the ::intel_npu::Config object: they don't need
+    // to be preserved there.
+    auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG"));
+    auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG"));
+
     m_cfg.update(any_copy(npuw_llm_props));
 
     LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
@@ -455,17 +471,20 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     prefill_model = cvt_kvcache_to_fp16(prefill_model);
 
     auto npudesc = extract_npu_descriptor(plugin);
 
-    ov::AnyMap properties_copy = other_props;
-    auto prefill_config = get_default_prefill_config(model, npudesc);
+    auto prefill_config =
+        prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();
-    // NB: GENERATE_HINT is only applicable for default generate config!
     const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
-    LOG_DEBUG(
-        "10. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
-    auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
+    LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
+    // NB: GENERATE_HINT is only applicable for default generate config!
+    if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
+        OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
+    }
+    auto generate_config =
+        generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>();
 
-    merge_config_with(prefill_config, properties_copy);
-    merge_config_with(generate_config, properties_copy);
+    merge_config_with(prefill_config, other_props);
+    merge_config_with(generate_config, other_props);
 
     m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
     m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
@@ -488,6 +507,11 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) {
 
 ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const {
     OPENVINO_SUPPRESS_DEPRECATED_START
+    if (name == ov::intel_npu::npuw::llm::prefill_config.name() ||
+        name == ov::intel_npu::npuw::llm::generate_config.name()) {
+        OPENVINO_THROW(name, " is a write-only option!");
+    }
+
     auto&& configIterator = m_prop_to_opt.find(name);
     if (configIterator != m_prop_to_opt.cend()) {
         return std::get<1>(configIterator->second)(m_cfg);
@@ -504,7 +528,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i
 
 std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() {
     auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this());
-    return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc);
+    return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr);
 }
 
 void ov::npuw::LLMCompiledModel::implement_properties() {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index 12f103cc0ab6a2..0bad68a35aa4ba 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -59,10 +59,9 @@ void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITenso
-ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
-                                           const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
+ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
     : ov::ISyncInferRequest(compiled_model),
-      m_kvcache_desc(kvcache_desc) {
+      m_npuw_llm_compiled_model(compiled_model) {
     m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
     m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();
 
@@ -82,13 +81,11 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
-    fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
-    m_kvcache_desc.num_stored_tokens = 0u;
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
+    fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
+    m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
 }
 
 void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
@@ -112,7 +109,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
     std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);
 
     m_prefill_request->infer();
-    m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
+    m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
     m_need_copy_kvcache = true;
 
     m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits"));
@@ -126,8 +123,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
     LOG_DEBUG("Calling inference for generate model...");
     LOG_BLOCK();
 
+    auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
     // NB: KV-cache is full, further generation is impossible
-    if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
+    if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
         OPENVINO_THROW("KV-Cache is full.");
     }
 
@@ -146,17 +144,16 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
             // taking into account kvcache dimension.
             fill_tensor<ov::float16>(kvcache_in_tensor, 0);
 
-            const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
+            const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
                                      ? 3u
-                                     : m_kvcache_desc.dim;
+                                     : kvcache_desc.dim;
 
-            auto prefill_out_slice =
-                make_tensor_slice(prefill_out_tensor,
-                                  kv_dim,
-                                  m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
-                                  m_kvcache_desc.max_prompt_size);
+            auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
+                                                       kv_dim,
+                                                       kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens,
+                                                       kvcache_desc.max_prompt_size);
 
-            auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens);
+            auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, kvcache_desc.num_stored_tokens);
 
             if (kv_dim == 3u) {
                 copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
@@ -168,7 +165,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         LOG_DEBUG("Prepare attention mask pattern.");
         auto* attention_mask_data =
             m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
-        attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
+        attention_mask_data[kvcache_desc.total_size - 1] = 1;
 
         m_need_copy_kvcache = false;
     }
@@ -185,7 +182,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
 
     m_kvcache_request->infer();
     m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
-    m_kvcache_desc.num_stored_tokens += 1;
+    kvcache_desc.num_stored_tokens += 1;
 
     LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
     const std::size_t kStartOutputKVCacheLayers = 1u;
@@ -194,13 +191,13 @@
         const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
-        const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
+        const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
                                  ? 3u
-                                 : m_kvcache_desc.dim;
+                                 : kvcache_desc.dim;
         auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
                                                   kv_dim,
-                                                  m_kvcache_desc.num_stored_tokens - 1,
-                                                  m_kvcache_desc.num_stored_tokens);
+                                                  kvcache_desc.num_stored_tokens - 1,
+                                                  kvcache_desc.num_stored_tokens);
         auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
         kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
     }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
index fbc6c702c4b62a..ea0b83d271b58e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -15,8 +15,7 @@ namespace npuw {
 
 class LLMInferRequest final : public ov::ISyncInferRequest {
 public:
-    explicit LLMInferRequest(const std::shared_ptr<LLMCompiledModel>& compiled_model,
-                             const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc);
+    explicit LLMInferRequest(const std::shared_ptr<LLMCompiledModel>& compiled_model);
 
     void infer() override;
 
@@ -44,7 +43,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
 
     std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request;
    std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request;
-    LLMCompiledModel::KVCacheDesc m_kvcache_desc;
+    std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
     ov::SoPtr<ov::ITensor> m_logits;
     bool m_need_copy_kvcache = false;
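
For context, a minimal caller-side sketch of how the two new write-only options might be passed when compiling through the NPUW LLM pipeline. Only the NPUW_LLM_PREFILL_CONFIG / NPUW_LLM_GENERATE_CONFIG keys and their write-only behavior come from this change; the model path, the pipeline switches ("NPU_USE_NPUW", "NPUW_LLM"), and the example values inside the nested maps are assumptions for illustration.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // placeholder model path (assumption)

    ov::AnyMap props = {
        // Assumed switches that route compilation through NPUW's LLM pipeline.
        {"NPU_USE_NPUW", "YES"},
        {"NPUW_LLM", "YES"},
        // Forwarded as the compile-time config of the prefill submodel,
        // replacing the default from get_default_prefill_config();
        // the remaining non-NPUW_LLM properties are still merged on top.
        {"NPUW_LLM_PREFILL_CONFIG",
         ov::AnyMap{{"NPU_COMPILATION_MODE_PARAMS",
                     "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}}},
        // Same for the generate submodel. Must not be combined with
        // NPUW_LLM_GENERATE_HINT, which only applies to the default generate
        // config: the LLMCompiledModel constructor throws in that case.
        {"NPUW_LLM_GENERATE_CONFIG", ov::AnyMap{{"PERFORMANCE_HINT", "LATENCY"}}}};

    auto compiled = core.compile_model(model, "NPU", props);

    // Both options are write-only; reading them back throws:
    // compiled.get_property("NPUW_LLM_PREFILL_CONFIG");
    return 0;
}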