From ef3a5cb2ed6bb1149b00c76341582ed8c3ab43f6 Mon Sep 17 00:00:00 2001 From: Oleh Kurachenko Date: Fri, 16 Sep 2022 16:32:42 +0300 Subject: [PATCH] Fix cpu reorder for binary nchw->nhwc --- src/cpu/reorder/cpu_reorder.cpp | 1 + src/cpu/reorder/cpu_reorder.hpp | 1 + .../reorder/cpu_reorder_regular_f32_bin.cpp | 5 +- .../reorder/cpu_reorder_regular_s32_bin.cpp | 41 ++++++ src/cpu/reorder/simple_reorder.hpp | 125 ++++++++++++++---- 5 files changed, 145 insertions(+), 28 deletions(-) create mode 100644 src/cpu/reorder/cpu_reorder_regular_s32_bin.cpp diff --git a/src/cpu/reorder/cpu_reorder.cpp b/src/cpu/reorder/cpu_reorder.cpp index 269b083e274..6c59ae41d3a 100644 --- a/src/cpu/reorder/cpu_reorder.cpp +++ b/src/cpu/reorder/cpu_reorder.cpp @@ -33,6 +33,7 @@ const static std::map &regular_impl_list_map() {{f32, bin, 0}, &regular_f32_bin_impl_list_map()}, {{bf16, data_type::undef, 0}, &regular_bf16_impl_list_map()}, {{f16, data_type::undef, 0}, &regular_f16_impl_list_map()}, + {{s32, bin, 0}, &regular_s32_bin_impl_list_map()}, {{s32, data_type::undef, 0}, &regular_s32_impl_list_map()}, {{s8, data_type::undef, 0}, &regular_s8_impl_list_map()}, {{u8, data_type::undef, 0}, &regular_u8_impl_list_map()}, diff --git a/src/cpu/reorder/cpu_reorder.hpp b/src/cpu/reorder/cpu_reorder.hpp index e3c7d2f786a..71ea598b0c0 100644 --- a/src/cpu/reorder/cpu_reorder.hpp +++ b/src/cpu/reorder/cpu_reorder.hpp @@ -75,6 +75,7 @@ extern const impl_list_map_t &regular_f32_u8_impl_list_map(); extern const impl_list_map_t &regular_f32_bin_impl_list_map(); extern const impl_list_map_t &regular_bf16_impl_list_map(); extern const impl_list_map_t &regular_f16_impl_list_map(); +extern const impl_list_map_t &regular_s32_bin_impl_list_map(); extern const impl_list_map_t &regular_s32_impl_list_map(); extern const impl_list_map_t &regular_s8_impl_list_map(); extern const impl_list_map_t &regular_u8_impl_list_map(); diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp index f050b6a648e..717b294702e 
100644 --- a/src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp @@ -26,10 +26,9 @@ const impl_list_map_t &regular_f32_bin_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // bin -> {{f32, bin, 4}, { - REG_SR_BIDIR(f32, nchw, bin, nhwc) - REG_SR_BIDIR(f32, nhwc, bin, nhwc) + REG_SR(f32, nchw, bin, nhwc, fmt_order_keep) - nullptr, + nullptr, }}, }); return the_map; diff --git a/src/cpu/reorder/cpu_reorder_regular_s32_bin.cpp b/src/cpu/reorder/cpu_reorder_regular_s32_bin.cpp new file mode 100644 index 00000000000..83f6078f947 --- /dev/null +++ b/src/cpu/reorder/cpu_reorder_regular_s32_bin.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* +* Copyright 2022 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "cpu/reorder/cpu_reorder.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { + +// clang-format off + +const impl_list_map_t &regular_s32_bin_impl_list_map() { + static const impl_list_map_t the_map = REG_REORDER_P({ + // bin -> + {{s32, bin, 4}, { + REG_SR(s32, nchw, bin, nhwc, fmt_order_keep) + + nullptr, + }}, + }); + return the_map; +} + +// clang-format on + +} // namespace cpu +} // namespace impl +} // namespace dnnl \ No newline at end of file diff --git a/src/cpu/reorder/simple_reorder.hpp b/src/cpu/reorder/simple_reorder.hpp index 13080cb2ff5..3d17c148e3b 100644 --- a/src/cpu/reorder/simple_reorder.hpp +++ b/src/cpu/reorder/simple_reorder.hpp @@ -75,7 +75,9 @@ struct conv_req_comp {}; // {s8, u8: asymmetric quantization} const auto &scratchpad = ctx.get_scratchpad_grantor(); \ MAYBE_UNUSED(scratchpad); \ const auto input_d = ctx.memory_mdw(DNNL_ARG_FROM, pd->src_md()); \ + MAYBE_UNUSED(input_d); \ const auto output_d = ctx.memory_mdw(DNNL_ARG_TO, pd->dst_md()); \ + MAYBE_UNUSED(output_d); \ const float alpha = pd->alpha(); \ MAYBE_UNUSED(alpha); \ const float beta = pd->beta(); \ @@ -1759,14 +1761,14 @@ struct simple_reorder_impl struct simple_reorder_impl::type> -{ + typename utils::enable_if::type> { + static bool is_applicable(const memory_desc_wrapper &input_d, - const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { + const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { return simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d) - && simple_attr_check(attr, false, false); + && simple_attr_check(attr, false, false); } GET_SCRATCHPAD_SIZE_ZERO(); @@ -1774,42 +1776,115 @@ typename utils::enable_if<(tag_i == format_tag::nchw || tag_i == format_tag::nhw static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) { DECLARE_COMMON_PARAMS(); + struct logical_index_t { + int n; + int c; + int h; + int w; + }; 
+ + constexpr int nbits = 8; + const auto &dims = input_d.dims(); + const int N = dims[0]; const int C = dims[1]; const int H = dims[2]; const int W = dims[3]; - int nbits = 8; - const int CB = utils::div_up(C, nbits); + auto input_logical_index = [N, C, H, W](const int output_byte_index, + const int output_bit_index) { + const int output_index + = output_byte_index * nbits + output_bit_index; - auto ker = [&](const data_t *i, data_t *o) { - for (int cb = 0; cb < CB; ++cb) { - uint8_t bin_val = 0x00; - for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { - const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[1]; + const int c_input_index = output_index % C; + const int w_input_index = (output_index / C) % W; + const int h_input_index = (output_index / (C * W)) % H; + const int n_input_index = (output_index / (C * W * H)); - auto bit = uint8_t((i[flat_off] > 0) ? 0x01 : 0x00); - bin_val |= (bit << shift); - } + return logical_index_t { + n_input_index, c_input_index, h_input_index, w_input_index}; + }; - o[cb] = bin_val; + auto ker = [&](const int output_byte_index, const int bits) { + uint8_t bin_val = 0x00; + + for (int i = 0; i < bits; ++i) { + const logical_index_t input_logical_idx + = input_logical_index(output_byte_index, i); + const int input_index = input_d.blk_off(input_logical_idx.n, + input_logical_idx.c, input_logical_idx.h, + input_logical_idx.w); + auto bit = uint8_t((input[input_index] > 0) ? 
0x01 : 0x00); + bin_val |= (bit << i); } - }; - parallel_nd(dims[0], H, W, - [&](int n, int h, int w) { - auto iidx = input_d.blk_off(n, 0, h, w); - auto oidx = output_d.blk_off(n, 0, h, w); + output[output_byte_index] = bin_val; + }; - auto i = &input[iidx]; - auto o = &output[oidx / nbits]; - ker(i, o); + const int output_size = utils::div_up(N * C * H * W, nbits); + parallel_nd(output_size, [&](int output_byte_index) { + ker(output_byte_index, + std::min(nbits, + (N * C * H * W) - output_byte_index * nbits)); }); return status::success; } }; +//template +//struct simple_reorder_impl::type> +//{ +// static bool is_applicable(const memory_desc_wrapper &input_d, +// const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { +// return simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d) +// && simple_attr_check(attr, false, false); +// } +// +// GET_SCRATCHPAD_SIZE_ZERO(); +// +// static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) { +// DECLARE_COMMON_PARAMS(); +// +// const auto &dims = input_d.dims(); +// const int C = dims[1]; +// const int H = dims[2]; +// const int W = dims[3]; +// +// int nbits = 8; +// const int CB = utils::div_up(C, nbits); +// +// auto ker = [&](const data_t *i, data_t *o) { +// for (int cb = 0; cb < CB; ++cb) { +// uint8_t bin_val = 0x00; +// for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { +// const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[1]; +// +// auto bit = uint8_t((i[flat_off] > 0) ? 0x01 : 0x00); +// bin_val |= (bit << shift); +// } +// +// o[cb] = bin_val; +// } +// }; +// +// parallel_nd(dims[0], H, W, +// [&](int n, int h, int w) { +// auto iidx = input_d.blk_off(n, 0, h, w); +// auto oidx = output_d.blk_off(n, 0, h, w); +// +// auto i = &input[iidx]; +// auto o = &output[oidx / nbits]; +// ker(i, o); +// }); +// +// return status::success; +// } +//}; + template struct simple_reorder_impl