Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix cpu reorder for binary nchw->nhwc #147

Draft
wants to merge 1 commit into
base: v2.6_for_ie_master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/cpu/reorder/cpu_reorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const static std::map<reorder_impl_key_t, const void *> &regular_impl_list_map()
{{f32, bin, 0}, &regular_f32_bin_impl_list_map()},
{{bf16, data_type::undef, 0}, &regular_bf16_impl_list_map()},
{{f16, data_type::undef, 0}, &regular_f16_impl_list_map()},
{{s32, bin, 0}, &regular_s32_bin_impl_list_map()},
{{s32, data_type::undef, 0}, &regular_s32_impl_list_map()},
{{s8, data_type::undef, 0}, &regular_s8_impl_list_map()},
{{u8, data_type::undef, 0}, &regular_u8_impl_list_map()},
Expand Down
1 change: 1 addition & 0 deletions src/cpu/reorder/cpu_reorder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ extern const impl_list_map_t &regular_f32_u8_impl_list_map();
extern const impl_list_map_t &regular_f32_bin_impl_list_map();
extern const impl_list_map_t &regular_bf16_impl_list_map();
extern const impl_list_map_t &regular_f16_impl_list_map();
extern const impl_list_map_t &regular_s32_bin_impl_list_map();
extern const impl_list_map_t &regular_s32_impl_list_map();
extern const impl_list_map_t &regular_s8_impl_list_map();
extern const impl_list_map_t &regular_u8_impl_list_map();
Expand Down
5 changes: 2 additions & 3 deletions src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,9 @@ const impl_list_map_t &regular_f32_bin_impl_list_map() {
static const impl_list_map_t the_map = REG_REORDER_P({
// bin ->
{{f32, bin, 4}, {
REG_SR_BIDIR(f32, nchw, bin, nhwc)
REG_SR_BIDIR(f32, nhwc, bin, nhwc)
REG_SR(f32, nchw, bin, nhwc, fmt_order_keep)

nullptr,
nullptr,
}},
});
return the_map;
Expand Down
41 changes: 41 additions & 0 deletions src/cpu/reorder/cpu_reorder_regular_s32_bin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/reorder/cpu_reorder.hpp"

namespace dnnl {
namespace impl {
namespace cpu {

// clang-format off

// Returns the table of reorder implementations that convert s32 data to the
// binary (bin) data type. The table is a function-local static, so it is
// constructed once and the same object is returned on every call.
// Each entry is keyed by {source type, destination type, 4}; the trailing 4
// is presumably the tensor rank (nchw/nhwc are 4D tags) — TODO confirm.
const impl_list_map_t &regular_s32_bin_impl_list_map() {
static const impl_list_map_t the_map = REG_REORDER_P({
// s32 -> bin
{{s32, bin, 4}, {
// Only the forward direction (s32 nchw -> bin nhwc) is registered.
REG_SR(s32, nchw, bin, nhwc, fmt_order_keep)

// NOTE(review): nullptr presumably terminates the implementation list — confirm.
nullptr,
}},
});
return the_map;
}

// clang-format on

} // namespace cpu
} // namespace impl
} // namespace dnnl
125 changes: 100 additions & 25 deletions src/cpu/reorder/simple_reorder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ struct conv_req_comp {}; // {s8, u8: asymmetric quantization}
const auto &scratchpad = ctx.get_scratchpad_grantor(); \
MAYBE_UNUSED(scratchpad); \
const auto input_d = ctx.memory_mdw(DNNL_ARG_FROM, pd->src_md()); \
MAYBE_UNUSED(input_d); \
const auto output_d = ctx.memory_mdw(DNNL_ARG_TO, pd->dst_md()); \
MAYBE_UNUSED(output_d); \
const float alpha = pd->alpha(); \
MAYBE_UNUSED(alpha); \
const float beta = pd->beta(); \
Expand Down Expand Up @@ -1759,57 +1761,130 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,

// Reorder specialization that packs a plain nchw tensor into a binary nhwc
// tensor, producing one output bit per input element (bit set when the input
// value is > 0).
// NOTE(review): this span is rendered from a diff view, so lines of the
// previous implementation (per-pixel channel packing using `CB` and
// `flat_off`) and of the new implementation (per-output-byte packing using
// `input_logical_index`) appear interleaved. Read it as two versions of the
// same specialization, not as one compilable definition.
template <SIMPLE_REORDER_TEMPL_DECL>
struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
typename utils::enable_if<(tag_i == format_tag::nchw || tag_i == format_tag::nhwc) &&
tag_o == format_tag::nhwc &&
(type_i == dnnl_bin || type_o == dnnl_bin)>::type>
{
typename utils::enable_if<tag_i == format_tag::nchw
&& tag_o == format_tag::nhwc && type_o == dnnl_bin
&& order_keep>::type> {

// Applicable when the format check for (tag_i, tag_o) passes and the
// attribute check passes with both optional flags disabled.
static bool is_applicable(const memory_desc_wrapper &input_d,
const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
return simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d)
&& simple_attr_check(attr, false, false);
&& simple_attr_check(attr, false, false);
}

GET_SCRATCHPAD_SIZE_ZERO();

// Performs the reorder; always returns status::success.
static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) {
DECLARE_COMMON_PARAMS();

// Logical (batch, channel, height, width) coordinates of one element.
struct logical_index_t {
int n;
int c;
int h;
int w;
};

// Number of binary values packed into one output byte.
constexpr int nbits = 8;

const auto &dims = input_d.dims();
const int N = dims[0];
const int C = dims[1];
const int H = dims[2];
const int W = dims[3];

int nbits = 8;
const int CB = utils::div_up(C, nbits);
// Maps (output byte, bit within that byte) to the logical coordinates of
// the source element. The flat bit index is decomposed with C varying
// fastest, then W, then H, then N, i.e. a dense nhwc linearization.
// NOTE(review): N is captured but not used inside the lambda.
auto input_logical_index = [N, C, H, W](const int output_byte_index,
const int output_bit_index) {
const int output_index
= output_byte_index * nbits + output_bit_index;

auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
for (int cb = 0; cb < CB; ++cb) {
uint8_t bin_val = 0x00;
for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) {
const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[1];
const int c_input_index = output_index % C;
const int w_input_index = (output_index / C) % W;
const int h_input_index = (output_index / (C * W)) % H;
const int n_input_index = (output_index / (C * W * H));

auto bit = uint8_t((i[flat_off] > 0) ? 0x01 : 0x00);
bin_val |= (bit << shift);
}
return logical_index_t {
n_input_index, c_input_index, h_input_index, w_input_index};
};

o[cb] = bin_val;
// Builds one output byte from `bits` consecutive elements: element i of the
// byte is read through input_d.blk_off and stored in bit position i
// (least-significant bit first).
// NOTE(review): the byte is written to output[output_byte_index] without
// going through output_d — presumably assumes a dense nhwc destination;
// confirm.
auto ker = [&](const int output_byte_index, const int bits) {
uint8_t bin_val = 0x00;

for (int i = 0; i < bits; ++i) {
const logical_index_t input_logical_idx
= input_logical_index(output_byte_index, i);
const int input_index = input_d.blk_off(input_logical_idx.n,
input_logical_idx.c, input_logical_idx.h,
input_logical_idx.w);
auto bit = uint8_t((input[input_index] > 0) ? 0x01 : 0x00);
bin_val |= (bit << i);
}
};

parallel_nd(dims[0], H, W,
[&](int n, int h, int w) {
auto iidx = input_d.blk_off(n, 0, h, w);
auto oidx = output_d.blk_off(n, 0, h, w);
output[output_byte_index] = bin_val;
};

auto i = &input[iidx];
auto o = &output[oidx / nbits];
ker(i, o);
// Number of output bytes; the last byte may be only partially filled, so
// the kernel is given min(nbits, remaining elements) as its bit count.
// NOTE(review): N * C * H * W is evaluated in `int` — confirm it cannot
// overflow for the supported tensor sizes.
const int output_size = utils::div_up(N * C * H * W, nbits);
parallel_nd(output_size, [&](int output_byte_index) {
ker(output_byte_index,
std::min(nbits,
(N * C * H * W) - output_byte_index * nbits));
});

return status::success;
}
};

//template <SIMPLE_REORDER_TEMPL_DECL>
//struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
//typename utils::enable_if<(tag_i == format_tag::nchw || tag_i == format_tag::nhwc) &&
// tag_o == format_tag::nhwc &&
// (type_i == dnnl_bin || type_o == dnnl_bin)>::type>
//{
// static bool is_applicable(const memory_desc_wrapper &input_d,
// const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
// return simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d)
// && simple_attr_check(attr, false, false);
// }
//
// GET_SCRATCHPAD_SIZE_ZERO();
//
// static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) {
// DECLARE_COMMON_PARAMS();
//
// const auto &dims = input_d.dims();
// const int C = dims[1];
// const int H = dims[2];
// const int W = dims[3];
//
// int nbits = 8;
// const int CB = utils::div_up(C, nbits);
//
// auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
// for (int cb = 0; cb < CB; ++cb) {
// uint8_t bin_val = 0x00;
// for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) {
// const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[1];
//
// auto bit = uint8_t((i[flat_off] > 0) ? 0x01 : 0x00);
// bin_val |= (bit << shift);
// }
//
// o[cb] = bin_val;
// }
// };
//
// parallel_nd(dims[0], H, W,
// [&](int n, int h, int w) {
// auto iidx = input_d.blk_off(n, 0, h, w);
// auto oidx = output_d.blk_off(n, 0, h, w);
//
// auto i = &input[iidx];
// auto o = &output[oidx / nbits];
// ker(i, o);
// });
//
// return status::success;
// }
//};

template <SIMPLE_REORDER_TEMPL_DECL>
struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
typename utils::enable_if<tag_i == format_tag::any &&
Expand Down