From 940b0cb71005445b2e750168a523e47a1d4c4ba0 Mon Sep 17 00:00:00 2001 From: Kurek Date: Fri, 13 Dec 2024 08:50:44 +0100 Subject: [PATCH 1/2] [GPU][WIP] Common shape_info buffer --- /* Reviewer notes, placed in the free-form area between the --- separator and the diffstat so that no hunk content changes. Series summary: patch 1/2 adds network::preallocate_shape_info_buffers(), called from the network constructor right after validate_primitives(). It sums node.get_total_shape_info_size() over _exec_order, resets the shape-info pointer of each primitive to nullptr and, when the sum is non-zero, allocates one i32 buffer and gives every primitive with a non-zero size a view created by the new engine::reinterpret_subbuffer(memory, new_layout, offset). Patch 2/2 keeps that allocation in a new network member _ptr, sizes it to exactly sum elements, drops the unused new_ptr local and advances the offset by shape_elements * 4 instead of a fixed 4096. NOTE(review): (1) patch 1 adds std::cout prints (one per primitive plus a total) and patch 2 leaves them in place; presumably leftover debug output - confirm before merge. (2) In ocl_engine::reinterpret_subbuffer only the USM branch uses offset (byte arithmetic on a char pointer); the image_2d and plain-buffer branches wrap the parent buffer unchanged, so every view would start at offset 0 on those paths. (3) In patch 1 the buffer holds sum * 128 elements while the offset grows by 4096 per primitive and nothing relates the two constants. (4) new_sum is declared int but is passed as the size_t offset argument and, in patch 2, incremented by an int64_t product. (5) The literal 4 in patch 2 presumably stands for the byte size of one data_types::i32 element - TODO confirm; views are then packed back to back with no alignment padding - verify that the device accepts such offsets. (6) ocl_ext.hpp moves UsmHolder::_ptr ahead of the private: label and UsmMemory::_usmHelper ahead of the protected: label (presumably making both public; reinterpret_subbuffer reads _usmHelper directly) and adds UsmMemory::add_offset, a const method that rewrites the holder pointer in place; no call to add_offset is visible in this series. (7) Whether cl::UsmMemory(helper, ptr) takes ownership of the offset pointer cannot be seen here - confirm that destroying a view does not free an interior pointer. (8) The template arguments of std::make_shared and reinterpret_cast appear to be missing in this copy and line breaks are collapsed - presumably an extraction artifact; re-export the patches before applying. */ .../include/intel_gpu/graph/network.hpp | 1 + .../include/intel_gpu/runtime/engine.hpp | 2 ++ .../src/graph/include/primitive_inst.h | 1 + src/plugins/intel_gpu/src/graph/network.cpp | 31 ++++++++++++++++ .../intel_gpu/src/graph/primitive_inst.cpp | 4 +++ .../intel_gpu/src/runtime/ocl/ocl_engine.cpp | 36 +++++++++++++++++++ .../intel_gpu/src/runtime/ocl/ocl_engine.hpp | 1 + .../intel_gpu/src/runtime/ocl/ocl_ext.hpp | 9 +++-- 8 files changed, 83 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 5b7873c1500638..df30f17997ef09 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -160,6 +160,7 @@ struct network { std::map execute(const std::vector& dependencies = {}); void validate_primitives(); + void preallocate_shape_info_buffers(); void set_arguments(); // Implementation specific calls bool does_node_need_lockable_output(const primitive_id& id) const; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index 9638bf5fbf8379..e60bfa80f4b236 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -57,6 +57,8 @@ class engine { /// Created memory object from memory @p params and reinterpred the data using specified @p layout virtual memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) = 0; + virtual memory_ptr reinterpret_subbuffer(const memory& memory, const layout& new_layout, size_t offset) = 0; + /// Created memory object from the other @p memory and reinterpred the data using specified @p 
new_layout virtual memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) = 0; diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index fffcd100d9691c..89edd3223928a4 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -232,6 +232,7 @@ class primitive_inst { } memory::ptr shape_info_memory_ptr() const { return _shape_info_memory; } + void set_shape_info_memory_ptr(memory::ptr addr); void add_dep_events(const std::vector& events); void add_dep_event(event::ptr ev); diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 37152b0d9e3b4f..c12c6b677424ad 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -207,6 +207,7 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo build_insts_deps(); build_exec_order(); validate_primitives(); + preallocate_shape_info_buffers(); add_default_output_chains(); } @@ -275,6 +276,36 @@ void network::validate_primitives() { } } +void network::preallocate_shape_info_buffers() { + GPU_DEBUG_DEFINE_MEM_LOGGER("preallocate_shape_info_buffers"); + int64_t sum = 0; + + for (auto const& prim : _exec_order) { + auto& node = prim->get_node(); + int64_t shape_elements = node.get_total_shape_info_size(); + sum += shape_elements; + std::cout << "shape_elements " << shape_elements << std::endl; + prim->set_shape_info_memory_ptr(nullptr); + } + + std::cout << "Sum of shape elements " << sum << std::endl; + if(sum) { + auto& eng = get_engine(); + auto ptr = eng.allocate_memory(layout{{sum * 128}, data_types::i32, format::bfyx}, false); + //auto ptr = eng.allocate_memory(layout{{sum}, data_types::i32, format::bfyx}, false); + int new_sum = 0; + for (auto const& prim : _exec_order) { + auto& node = prim->get_node(); + int64_t 
shape_elements = node.get_total_shape_info_size(); + if(shape_elements == 0) continue; + auto new_ptr = ptr; + auto new_mem = eng.reinterpret_subbuffer(*ptr, layout{{shape_elements}, data_types::i32, format::bfyx}, new_sum); + prim->set_shape_info_memory_ptr(new_mem); + new_sum += 4096; + } + } +} + void network::set_arguments() { if (!_reset_arguments) return; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index b51c7825b5a8fa..adda78921cdf9e 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -1125,6 +1125,10 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la } } +void primitive_inst::set_shape_info_memory_ptr(memory::ptr addr) { + _shape_info_memory = addr; +} + void primitive_inst::allocate_shape_info_memory() { int64_t shape_elements = _node->get_total_shape_info_size(); _shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index df1cad281d636c..c6b3d14e2c86da 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -208,6 +208,42 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty } } +memory::ptr ocl_engine::reinterpret_subbuffer(const memory& memory, const layout& new_layout, size_t offset) { + OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine"); + OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(), + "[GPU] trying to reinterpret between image and non-image layouts. 
Current: ", + memory.get_layout().format.to_string(), " Target: ", new_layout.format.to_string()); + + try { + if (new_layout.format.is_image_2d()) { + return std::make_shared(this, + new_layout, + reinterpret_cast(memory).get_buffer(), + memory.get_mem_tracker()); + } else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) { + auto& new_buf = reinterpret_cast(memory); + auto helper = new_buf.get_buffer()._usmHelper; + auto ptr = new_buf.get_buffer().get(); + ptr = (char*) ptr + offset; + auto buffer = cl::UsmMemory(helper, ptr); + + return std::make_shared(this, + new_layout, + buffer, + memory.get_allocation_type(), + memory.get_mem_tracker()); + } else { + return std::make_shared(this, + new_layout, + reinterpret_cast(memory).get_buffer(), + memory.get_mem_tracker()); + } + } catch (cl::Error const& err) { + OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); + } +} + + memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) { OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine"); OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(), diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp index 2f6541bf0e7e00..2d78fc049e7e73 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp @@ -26,6 +26,7 @@ class ocl_engine : public engine { memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override; memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override; + memory_ptr reinterpret_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override; memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; bool is_the_same_buffer(const memory& mem1, const memory& mem2) override; bool 
check_allocatable(const layout& layout, allocation_type type) override; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp index 759d796a5e87e8..9006fa03bcc111 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp @@ -903,9 +903,10 @@ class UsmHolder { ~UsmHolder() { memFree(); } + + void* _ptr; private: const cl::UsmHelper& _usmHelper; - void* _ptr; bool _shared_memory = false; }; /* @@ -922,6 +923,10 @@ class UsmMemory { } } + void add_offset(size_t offsetPtr) const { + _usm_pointer->_ptr = (void*) ((unsigned char*) _usm_pointer->_ptr + offsetPtr); + } + // Get methods returns original pointer allocated by openCL. void* get() const { return _usm_pointer->ptr(); } @@ -953,9 +958,9 @@ class UsmMemory { } virtual ~UsmMemory() = default; + const UsmHelper& _usmHelper; protected: - const UsmHelper& _usmHelper; std::shared_ptr _usm_pointer = nullptr; private: From 498b793e8fb8c4d9e4ec9e1af3fcadac6ae87bf4 Mon Sep 17 00:00:00 2001 From: Kurek Date: Fri, 20 Dec 2024 10:53:50 +0100 Subject: [PATCH 2/2] [GPU][PoC] shape_info common fix --- .../intel_gpu/include/intel_gpu/graph/network.hpp | 1 + src/plugins/intel_gpu/src/graph/network.cpp | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index df30f17997ef09..fe78f550df5b46 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -216,6 +216,7 @@ struct network { bool _is_dynamic = false; bool _enable_profiling = false; bool _reset_arguments; + memory::ptr _ptr; std::unordered_map> _primitives; std::vector _in_out_shared_mem_types; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index c12c6b677424ad..60672b32850f0a 100644 
--- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -289,19 +289,18 @@ void network::preallocate_shape_info_buffers() { } std::cout << "Sum of shape elements " << sum << std::endl; + if(sum) { auto& eng = get_engine(); - auto ptr = eng.allocate_memory(layout{{sum * 128}, data_types::i32, format::bfyx}, false); - //auto ptr = eng.allocate_memory(layout{{sum}, data_types::i32, format::bfyx}, false); + _ptr = eng.allocate_memory(layout{{sum}, data_types::i32, format::bfyx}, false); int new_sum = 0; for (auto const& prim : _exec_order) { auto& node = prim->get_node(); int64_t shape_elements = node.get_total_shape_info_size(); if(shape_elements == 0) continue; - auto new_ptr = ptr; - auto new_mem = eng.reinterpret_subbuffer(*ptr, layout{{shape_elements}, data_types::i32, format::bfyx}, new_sum); + auto new_mem = eng.reinterpret_subbuffer(*_ptr, layout{{shape_elements}, data_types::i32, format::bfyx}, new_sum); prim->set_shape_info_memory_ptr(new_mem); - new_sum += 4096; + new_sum += shape_elements * 4; } } }