[GPU][PoC] Common shape info buffer #28167

Draft
Wants to merge 2 commits into base: master
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -160,6 +160,7 @@ struct network {
std::map<primitive_id, network_output> execute(const std::vector<event::ptr>& dependencies = {});

void validate_primitives();
void preallocate_shape_info_buffers();
void set_arguments();
// Implementation specific calls
bool does_node_need_lockable_output(const primitive_id& id) const;
@@ -215,6 +216,7 @@ struct network {
bool _is_dynamic = false;
bool _enable_profiling = false;
bool _reset_arguments;
memory::ptr _ptr;  // common buffer backing the shape info of all primitives
Review comment (Contributor):
I think it's better to hold the pointer to the base buffer in the sub-buffer itself, so that their lifetimes are better aligned.
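
A minimal sketch of that suggestion; the type and member names below are hypothetical, not the actual cldnn classes. The point is that the sub-buffer view holds an owning pointer to its parent allocation, so the parent stays alive as long as any view exists and network would not need _ptr purely for lifetime management.

#include <cstddef>
#include <memory>

// Hypothetical sub-buffer view that owns a reference to its parent buffer.
struct subbuffer_view {
    std::shared_ptr<void> base;   // keeps the parent allocation alive
    size_t offset_bytes = 0;      // start of this view inside the parent
    size_t size_bytes = 0;        // size of the view

    void* data() const {
        return static_cast<char*>(base.get()) + offset_bytes;
    }
};
// The parent buffer is released automatically once the last view (and any
// other owner) drops its shared_ptr, so no separate member in network is
// needed just to keep it alive.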


std::unordered_map<primitive_id, std::shared_ptr<primitive_inst>> _primitives;
std::vector<shared_mem_type> _in_out_shared_mem_types;
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -57,6 +57,8 @@ class engine {
/// Creates memory object from memory @p params and reinterprets the data using specified @p layout
virtual memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) = 0;

/// Creates a memory object that aliases a sub-range of @p memory starting at byte @p offset, reinterpreted using @p new_layout
virtual memory_ptr reinterpret_subbuffer(const memory& memory, const layout& new_layout, size_t offset) = 0;

/// Creates memory object from the other @p memory and reinterprets the data using specified @p new_layout
virtual memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) = 0;

1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -232,6 +232,7 @@ class primitive_inst {
}

memory::ptr shape_info_memory_ptr() const { return _shape_info_memory; }
void set_shape_info_memory_ptr(memory::ptr addr);

void add_dep_events(const std::vector<event::ptr>& events);
void add_dep_event(event::ptr ev);
30 changes: 30 additions & 0 deletions src/plugins/intel_gpu/src/graph/network.cpp
@@ -207,6 +207,7 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
build_insts_deps();
build_exec_order();
validate_primitives();
preallocate_shape_info_buffers();
add_default_output_chains();
}

@@ -275,6 +276,35 @@ void network::validate_primitives() {
}
}

void network::preallocate_shape_info_buffers() {
    GPU_DEBUG_DEFINE_MEM_LOGGER("preallocate_shape_info_buffers");
    int64_t sum = 0;

    // First pass: count the total number of shape info elements and drop any
    // previously assigned per-primitive shape info buffers.
    for (auto const& prim : _exec_order) {
        auto& node = prim->get_node();
        int64_t shape_elements = node.get_total_shape_info_size();
        sum += shape_elements;
        std::cout << "shape_elements " << shape_elements << std::endl;  // PoC debug output
        prim->set_shape_info_memory_ptr(nullptr);
    }

    std::cout << "Sum of shape elements " << sum << std::endl;  // PoC debug output

    if (sum > 0) {
        auto& eng = get_engine();
        // Single shared allocation that backs the shape info of all primitives.
        _ptr = eng.allocate_memory(layout{{sum}, data_types::i32, format::bfyx}, false);
        int64_t offset = 0;  // byte offset into the common buffer
        for (auto const& prim : _exec_order) {
            auto& node = prim->get_node();
            int64_t shape_elements = node.get_total_shape_info_size();
            if (shape_elements == 0)
                continue;
            auto new_mem = eng.reinterpret_subbuffer(*_ptr, layout{{shape_elements}, data_types::i32, format::bfyx}, offset);
            prim->set_shape_info_memory_ptr(new_mem);
            offset += shape_elements * 4;  // 4 bytes per i32 element
        }
    }
}

void network::set_arguments() {
if (!_reset_arguments)
return;
Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1125,6 +1125,10 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la
}
}

void primitive_inst::set_shape_info_memory_ptr(memory::ptr addr) {
_shape_info_memory = addr;
}

void primitive_inst::allocate_shape_info_memory() {
int64_t shape_elements = _node->get_total_shape_info_size();
_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);
36 changes: 36 additions & 0 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -208,6 +208,42 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
}
}

memory::ptr ocl_engine::reinterpret_subbuffer(const memory& memory, const layout& new_layout, size_t offset) {
    OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine");
    OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(),
                    "[GPU] trying to reinterpret between image and non-image layouts. Current: ",
                    memory.get_layout().format.to_string(), " Target: ", new_layout.format.to_string());

    try {
        if (new_layout.format.is_image_2d()) {
            // NOTE: offset is not applied to image allocations in this PoC.
            return std::make_shared<ocl::gpu_image2d>(this,
                                                      new_layout,
                                                      reinterpret_cast<const ocl::gpu_image2d&>(memory).get_buffer(),
                                                      memory.get_mem_tracker());
        } else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
            auto& usm_mem = reinterpret_cast<const ocl::gpu_usm&>(memory);
            // Keep a reference (not a copy) to the helper: UsmMemory/UsmHolder store a reference to it.
            const auto& helper = usm_mem.get_buffer()._usmHelper;
            auto ptr = static_cast<char*>(usm_mem.get_buffer().get()) + offset;
            auto buffer = cl::UsmMemory(helper, ptr);

            return std::make_shared<ocl::gpu_usm>(this,
                                                  new_layout,
                                                  buffer,
                                                  memory.get_allocation_type(),
                                                  memory.get_mem_tracker());
        } else {
            // NOTE: offset is not applied to cl_mem buffers in this PoC; that would require a cl sub-buffer.
            return std::make_shared<ocl::gpu_buffer>(this,
                                                     new_layout,
                                                     reinterpret_cast<const ocl::gpu_buffer&>(memory).get_buffer(),
                                                     memory.get_mem_tracker());
        }
    } catch (cl::Error const& err) {
        OPENVINO_THROW(OCL_ERR_MSG_FMT(err));
    }
}


memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine");
OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(),
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -26,6 +26,7 @@ class ocl_engine : public engine {

memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override;
memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override;
memory_ptr reinterpret_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override;
memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
bool check_allocatable(const layout& layout, allocation_type type) override;
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
@@ -903,9 +903,10 @@ class UsmHolder {
~UsmHolder() {
memFree();
}

void* _ptr;
private:
const cl::UsmHelper& _usmHelper;
void* _ptr;
bool _shared_memory = false;
};
/*
@@ -922,6 +923,10 @@ class UsmMemory {
}
}

void add_offset(size_t offsetPtr) const {
_usm_pointer->_ptr = (void*) ((unsigned char*) _usm_pointer->_ptr + offsetPtr);
}
Comment on lines +926 to +928
Review comment (Contributor):
Probably we shouldn't expose this function, because it allows shifting the pointer of any UsmMemory. The same pointer is also used for the memFree() call, and I'm not sure it will be handled properly if it is shifted accidentally for a non-shared memory buffer. Maybe it would be better to restrict shifting to a specialized constructor like UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr, size_t offset = 0), which creates the UsmHolder with the shared_memory=true option.
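
A sketch of the constructor this comment proposes, as a UsmMemory member, modeled on the existing UsmMemory(usmHelper, usm_ptr) shared-memory constructor shown above; the UsmHolder constructor arguments (helper, pointer, shared_memory flag) are assumed from the surrounding class, so treat this as an illustration rather than the final API.

// Sketch only: apply the offset once at construction time and mark the holder
// as shared memory, so memFree() will not try to release a pointer that has
// been shifted away from the allocation base.
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr, size_t offset = 0)
    : _usmHelper(usmHelper),
      _usm_pointer(std::make_shared<UsmHolder>(
          _usmHelper,
          static_cast<char*>(usm_ptr) + offset,
          /* shared_memory = */ true)) {
    if (usm_ptr == nullptr) {
        throw std::runtime_error("[GPU] Can't share null usm pointer");
    }
}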


// Get methods return the original pointer allocated by OpenCL.
void* get() const { return _usm_pointer->ptr(); }

@@ -953,9 +958,9 @@
}

virtual ~UsmMemory() = default;
const UsmHelper& _usmHelper;

protected:
const UsmHelper& _usmHelper;
std::shared_ptr<UsmHolder> _usm_pointer = nullptr;

private: