diff options
Diffstat (limited to 'src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h')
-rw-r--r-- | src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h | 279 |
1 files changed, 166 insertions, 113 deletions
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h index 8410cdbf16..3fa5c58c3c 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h @@ -24,6 +24,7 @@ #ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -63,15 +64,21 @@ struct DepthwiseConvolutionRunInfo const size_t input_width; const size_t input_depth; - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT - : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)), + DepthwiseConvolutionRunInfo(const ITensorInfo &input, + const ITensorInfo &weights, + const PadStrideInfo &conv_info, + const Window &w, + uint32_t depth_multiplier = 1) // NOLINT + : num_read_elements_per_iteration( + (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)), x_start(w.x().start()), x_end(w.x().end()), x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)), x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))), input_stride_y(input.strides_in_bytes().y()), input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - + (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), weights_width(weights.dimension(width_idx)), weights_height(weights.dimension(height_idx)), weights_stride_y(weights.strides_in_bytes().y()), @@ -87,7 +94,12 @@ struct DepthwiseConvolutionRunInfo } }; -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) +inline bool is_valid_input_region(int32_t base_w, + uint32_t base_h, + uint32_t w, + uint32_t h, + const DepthwiseConvolutionRunInfo &run_info, + const Size2D &dilation) { const int32_t current_h = base_h + h * dilation.y(); const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height); @@ -99,8 +111,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u } template <typename T> -void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) +void depthwise_loop_multiplier1_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const Window &window, + bool has_biases) { constexpr auto element_per_vector = vector_size / sizeof(T); using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type; @@ -129,94 +147,112 @@ void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, c Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) + execute_window_loop( + execution_window, + [&](const Coordinates &id) { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; - for(uint32_t h = 0; h < run_info.weights_height; ++h) + for (; x < run_info.x_leftover_start; x += run_info.x_step) { - int64_t offs = input_offset + x * sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; ++w) + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for (uint32_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); + int64_t offs = input_offset + x * sizeof(T); + for (uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast<T *>( + input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) + : zero_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - offs += dilation.x() * run_info.input_stride_y; + if (has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; + wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc); } - if(has_biases) + for (; x < run_info.x_end; ++x) { - const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc); - } + auto acc_scalar = T{0}; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? *reinterpret_cast<T *>(input_it.ptr() + + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) + : 0; + const auto weights_vals = + *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x); - acc_scalar += biases_vals; + if (has_biases) + { + const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar; } - *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template <typename T> -void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) +void depthwise_loop_generic_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + const Window &window, + bool has_biases) { - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); Window execution_window = window; execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); @@ -240,81 +276,98 @@ void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector<T> acc(depth_multiplier, static_cast<T>(0)); + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector<T> acc(depth_multiplier, static_cast<T>(0)); - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), + run_info.input_max_offset))) + : T(0); + + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) + if (has_biases) { - const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) + else { - *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m); + for (size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } } - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template <typename T, typename TW> -void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void run_depthwise_float(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { PadStrideInfo conv_info = info.pad_stride_info; unsigned int depth_multiplier = info.depth_multiplier; Size2D dilation = info.dilation; - if(depth_multiplier == 1) + if (depth_multiplier == 1) { depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases); } else { - depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases); + depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, + has_biases); } } template <typename T, typename TW> -void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); } // namespace cpu } // namespace arm_compute |