From c9e519d2ea4780297d71e68cccc5de9c7bb7c0b4 Mon Sep 17 00:00:00 2001 From: alerah01 Date: Mon, 31 Jan 2022 19:04:10 +0200 Subject: Decouple CpuDirectConv2dKernel Resolves COMPMID-4626 Exclude SVE & SVE2 paths from android.bp NDK version does not support these extensions. Change-Id: I49b147d2a84819975d3225f2920106fa1a0d742f Signed-off-by: alerah01 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7136 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena --- Android.bp | 3 + filelist.json | 11 +- src/cpu/kernels/CpuDirectConv2dKernel.cpp | 425 ++--------------------- src/cpu/kernels/CpuDirectConv2dKernel.h | 22 +- src/cpu/kernels/CpuKernelSelectionTypes.h | 8 + src/cpu/kernels/directconv2d/list.h | 47 +++ src/cpu/kernels/directconv2d/nchw/all.cpp | 179 ++++++++++ src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp | 39 +++ src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp | 252 ++++++++++++++ src/cpu/kernels/directconv2d/nhwc/neon/impl.h | 42 +++ tests/validation/NEON/DirectConvolutionLayer.cpp | 37 ++ 11 files changed, 656 insertions(+), 409 deletions(-) create mode 100644 src/cpu/kernels/directconv2d/list.h create mode 100644 src/cpu/kernels/directconv2d/nchw/all.cpp create mode 100644 src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp create mode 100644 src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp create mode 100644 src/cpu/kernels/directconv2d/nhwc/neon/impl.h diff --git a/Android.bp b/Android.bp index db6e7faa26..3a49b8c362 100644 --- a/Android.bp +++ b/Android.bp @@ -448,6 +448,9 @@ cc_library_static { "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp", + "src/cpu/kernels/directconv2d/nchw/all.cpp", + "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp", + "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp", "src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp", 
"src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp", "src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp", diff --git a/filelist.json b/filelist.json index 185ef6d43f..7e47df959c 100644 --- a/filelist.json +++ b/filelist.json @@ -848,7 +848,9 @@ }, "sve": { "fp16": [ "src/cpu/kernels/activation/generic/sve/fp16.cpp" ], - "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ], + "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ] + }, + "sve2":{ "qasymm8": [ "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp" ], "qasymm8_signed": [ "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp" ], "qsymm16": [ "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp" ] @@ -1052,7 +1054,12 @@ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp", "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp", "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp" + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp", + "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp", + "src/cpu/kernels/directconv2d/nchw/all.cpp" + ], + "fp32": [ + "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp" ] } } diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp index f3560156bd..a4cdddee5e 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp @@ -22,26 +22,14 @@ * SOFTWARE. 
*/ #include "src/cpu/kernels/CpuDirectConv2dKernel.h" +#include "src/cpu/kernels/directconv2d/list.h" -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include - using namespace arm_compute::detail; namespace arm_compute @@ -50,8 +38,25 @@ namespace cpu { namespace kernels { -namespace +static const std::vector available_kernels = { + { + "neon_fp32_nhwc_directconv2d", + [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d) + }, + { + "neon_fp32_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d) + }, + { + "neon_fp16_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d) + }, +}; + Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); @@ -99,346 +104,6 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso return std::make_pair(err, win); 
} -bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) -{ - return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); -} - -} // namespace - -template -void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // This function assumes that input and weights have not padding in channel - - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; - const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - 
window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - /* - * This implementation parallelize the full WC plane of input and weights by - * treating them as series of elements. So for example, a 3x3 weights and - * floating point vector operations of 4 elements per time, the first 3 - * channel elements of the first row would be taken and additionally the first - * element of the second row. The 9 elements in each single WC weight plane - * would require 2 4-element vector operations and a last single element operation. - * - * This works since when we create the input vector to multiply with the weights, - * the exact required elements are loaded in the same order. Therefore the - * multiplication works on the correct input/weight elements. - */ - execute_window_loop(window_out, [&](const Coordinates & id) - { - /* - * In here we create theoretical indexes which then we validate for both - * inputs and weights. - * As a reminder, this loop take each output point in NHW, C is treated - * in the weights loop. 
- */ - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; - const int index_h_start = in_h_start - in_h_start_t; - const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; - const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - /* - * This is the loop in the weights, and it goes along N (the batches) - * As a reminder, the batches of the weights are translated into the - * channels of the output - */ - const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) - + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; - const T *weights_ptr_row = reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) - { - const T *in_ptr_mover = in_ptr_row; - int index_wc = index_wc_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += 
num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_row + index_wc); - out_temp += src_val * w_val; - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -template -void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; - const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - 
window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; 
index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) - { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_mover); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; - } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -template -void CpuDirectConv2dKernel::convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size; - const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size; - const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - - const int 
input_dim_w = src->info()->dimension(0); - const int input_dim_h = src->info()->dimension(1); - - const int output_stride_c = dst->info()->strides_in_bytes()[2]; - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size; - - const int kernel_dim_w = weights->info()->dimension(0); - const int kernel_dim_h = weights->info()->dimension(1); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.x()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.y()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = 
std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(2); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - T out_temp = static_cast(0); - - for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) - { - const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; - const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; - const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; - int index_w = in_w_start; - int index_wei_w = wei_w_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_w < in_w_end; ++index_w, ++index_wei_w) - { - const auto src_val = *(in_ptr_row + index_w * input_stride_w); - const auto w_val = 
*(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp += src_val * w_val; - } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - - }, - wei); - }, - out); -} - void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -484,53 +149,21 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_data_layout == DataLayout::NCHW) - { - switch(src->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - convolve_nchw(window, src, weights, dst); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - convolve_nchw(window, src, weights, dst); - break; - } - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - else - { - switch(src->info()->data_type()) - { - case DataType::F32: - { - if(have_zero_x_internal_padding(src->info(), weights->info())) - { - convolve_nhwc_optimized(window, src, weights, dst); - } - else - { - convolve_nhwc(window, src, weights, dst); - } - break; - } - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } + const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() }); + ARM_COMPUTE_ERROR_ON(uk == nullptr); + + uk->ukernel(window, src, weights, dst, _conv_info); } const char *CpuDirectConv2dKernel::name() const { return "CpuDirectConvolutionLayerKernel"; } + +const std::vector &CpuDirectConv2dKernel::get_available_kernels() +{ + return available_kernels; +} + } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h index 6ec4d4ee04..b9265dc630 
100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dKernel.h @@ -36,6 +36,9 @@ namespace kernels /** Interface for the kernel to perform Direct Convolution Layer. */ class CpuDirectConv2dKernel : public ICpuKernel { +private: + using DirectConv2dKernel_Ptr = std::add_pointer::type; + public: CpuDirectConv2dKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel); @@ -67,19 +70,16 @@ public: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; -private: - /* Template function for optimized convolution NHWC */ - template - void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); + struct DirectConv2dKernel + { + const char *name; + const DataTypeDataLayoutSelectorPtr is_selected; + DirectConv2dKernel_Ptr ukernel; + }; - /* Template function for convolution NHWC */ - template - void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - - /* Template function for convolution NCHW */ - template - void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); + static const std::vector &get_available_kernels(); +private: PadStrideInfo _conv_info{}; unsigned int _kernel_size{ 0 }; DataLayout _data_layout{ DataLayout::UNKNOWN }; diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 4a0ebd6e3f..8c5a39ad49 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -40,6 +40,13 @@ struct DataTypeISASelectorData cpuinfo::CpuIsaInfo isa; }; +struct DataTypeDataLayoutISASelectorData +{ + DataType dt; + DataLayout dl; + const cpuinfo::CpuIsaInfo &isa; +}; + struct PoolDataTypeISASelectorData { DataType dt; @@ -63,6 +70,7 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData }; // Selector pointer types using 
DataTypeISASelectorPtr = std::add_pointer::type; +using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; using PoolDataTypeISASelectorPtr = std::add_pointer::type; using ElementwiseDataTypeISASelectorPtr = std::add_pointer::type; using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer::type; diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h new file mode 100644 index 0000000000..9a0472643d --- /dev/null +++ b/src/cpu/kernels/directconv2d/list.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_CONV2D_LIST_H +#define SRC_CORE_NEON_KERNELS_CONV2D_LIST_H + +#include "src/core/common/Registrars.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d); +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d); +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nchw_directconv2d); + +#undef DECLARE_DIRECT_CONV2D_KERNEL + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_CONV2D_LIST_H diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp new file mode 100644 index 0000000000..a719fa50d6 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/all.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "src/core/helpers/WindowHelpers.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template +void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nchw(window, src, weights, dst, conv_info); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nchw(window, src, weights, dst, conv_info); +} + +template +void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_UNUSED(conv_info); + + // Declare useful types + using vtype = wrapper::traits::neon_bitvector; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = 
src->info()->strides_in_bytes()[0] / element_size; + const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size; + const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + + const int input_dim_w = src->info()->dimension(0); + const int input_dim_h = src->info()->dimension(1); + + const int output_stride_c = dst->info()->strides_in_bytes()[2]; + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size; + + const int kernel_dim_w = weights->info()->dimension(0); + const int kernel_dim_h = weights->info()->dimension(1); + + const int conv_pad_top = conv_info.pad_top(); + const int conv_pad_left = conv_info.pad_left(); + const int conv_stride_w = std::get<0>(conv_info.stride()); + const int conv_stride_h = std::get<1>(conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.x()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.y()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + 
kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(2); + const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; + execute_window_loop(window_w, [&](const Coordinates & id_w) + { + const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + T out_temp = static_cast(0); + + for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) + { + const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; + const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; + for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) + { + const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; + const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; + int index_w = in_w_start; + int index_wei_w = wei_w_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); + const auto w_vec = wrapper::vloadq(weights_ptr_row + 
index_wei_w * kernel_stride_w); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for(; index_w < in_w_end; ++index_w, ++index_wei_w) + { + const auto src_val = *(in_ptr_row + index_w * input_stride_w); + const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast(out_ptr)) = out_temp; + + }, + wei); + }, + out); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp new file mode 100644 index 0000000000..9982431de5 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nhwc<float>(window, src, weights, dst, conv_info); // FP32 NHWC entry point: forwards every argument unchanged to the float instantiation of the shared template +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp new file mode 100644 index 0000000000..500ad1b420 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "src/core/helpers/WindowHelpers.h" + +#include + +using namespace arm_compute::detail; + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) +{ + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); +} +} + +template +void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + // Declare useful types + using vtype = wrapper::traits::neon_bitvector; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + 
const int input_dim_h = src->info()->dimension(2); + + const int output_stride_c = dst->info()->strides_in_bytes().x(); + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); + + const int conv_pad_top = conv_info.pad_top(); + const int conv_pad_left = conv_info.pad_left(); + const int conv_stride_w = std::get<0>(conv_info.stride()); + const int conv_stride_h = std::get<1>(conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + // nhwc optimized + if(have_zero_x_internal_padding(src->info(), weights->info())) + { + // This function assumes that input and weights have not padding in channel + + /* + * This implementation parallelize the full WC plane of input and weights by + * treating them as series of elements. So for example, a 3x3 weights and + * floating point vector operations of 4 elements per time, the first 3 + * channel elements of the first row would be taken and additionally the first + * element of the second row. The 9 elements in each single WC weight plane + * would require 2 4-element vector operations and a last single element operation. 
+ * + * This works since when we create the input vector to multiply with the weights, + * the exact required elements are loaded in the same order. Therefore the + * multiplication works on the correct input/weight elements. + */ + execute_window_loop( + window_out, [&](const Coordinates & id) + { + /* + * In here we create theoretical indexes which then we validate for both + * inputs and weights. + * As a reminder, this loop take each output point in NHW, C is treated + * in the weights loop. + */ + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; + const int index_h_start = in_h_start - in_h_start_t; + const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; + const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + execute_window_loop( + window_w, [&](const Coordinates & id_w) + { + /* + * This is the loop in the weights, and it goes along N (the batches) + * As a reminder, the batches of the weights are translated into the + * channels of the output + */ + const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; + const T *weights_ptr_row = 
reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) + { + const T *in_ptr_mover = in_ptr_row; + int index_wc = index_wc_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_row + index_wc); + out_temp += src_val * w_val; + } + } + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); + }, + out); + } + else // nhwc non optimized + { + execute_window_loop( + window_out, [&](const Coordinates & id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start 
- in_h_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; + + execute_window_loop( + window_w, [&](const Coordinates & id_w) + { + const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) + { + const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; + for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_mover); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); + }, + out); + } +} + +template void convolve_nhwc(const Window &window, const ITensor *src, const 
ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h new file mode 100644 index 0000000000..88a151fba4 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H +#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H + +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template +void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif //SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp index 824741db5f..0f4c6bb279 100644 --- a/tests/validation/NEON/DirectConvolutionLayer.cpp +++ b/tests/validation/NEON/DirectConvolutionLayer.cpp @@ -26,6 +26,8 @@ #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/common/cpuinfo/CpuIsaInfo.h" +#include "src/cpu/kernels/CpuDirectConv2dKernel.h" #include "tests/NEON/Accessor.h" #include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" @@ -180,6 +182,41 @@ TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT) validate(Accessor(dst), ref_dst); } +DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, + concat(combine(combine(framework::dataset::make("CpuExt", std::string("NEON")), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + combine(combine(framework::dataset::make("CpuExt", std::string("NEON")), + framework::dataset::make("DataType", { DataType::F16 })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))), + cpu_ext, data_type, data_layout) +{ + using namespace cpu::kernels; + + cpuinfo::CpuIsaInfo cpu_isa{}; + cpu_isa.neon = (cpu_ext == "NEON"); + cpu_isa.fp16 = (data_type == DataType::F16); + + const auto *selected_impl = 
CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ data_type, data_layout, cpu_isa }, cpu::KernelSelectionType::Preferred); + + ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); + + std::string data_layout_str; + if(data_layout == DataLayout::NCHW) + { + data_layout_str = "nchw"; + } + else + { + data_layout_str = "nhwc"; + } + + std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_" + data_layout_str + "_directconv2d"; + std::string actual = selected_impl->name; + + ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( -- cgit v1.2.1