aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoralerah01 <alex.rahlis@arm.com>2022-01-31 19:04:10 +0200
committerAlex Rahlis <alex.rahlis@arm.com>2022-02-22 09:26:32 +0000
commitc9e519d2ea4780297d71e68cccc5de9c7bb7c0b4 (patch)
tree933427df414593e4a40d4c269c8065858a635d65
parent0597b7b55ba0c1aa2cc58e80f95dc918b92afe68 (diff)
downloadComputeLibrary-c9e519d2ea4780297d71e68cccc5de9c7bb7c0b4.tar.gz
Decouple CpuDirectConv2dKernel
Resolves COMPMID-4626 Exclude SVE & SVE2 paths from android.bp NDK version does not support these extensions. Change-Id: I49b147d2a84819975d3225f2920106fa1a0d742f Signed-off-by: alerah01 <alex.rahlis@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7136 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
-rw-r--r--Android.bp3
-rw-r--r--filelist.json11
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.cpp425
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.h22
-rw-r--r--src/cpu/kernels/CpuKernelSelectionTypes.h8
-rw-r--r--src/cpu/kernels/directconv2d/list.h47
-rw-r--r--src/cpu/kernels/directconv2d/nchw/all.cpp179
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp39
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp252
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/impl.h42
-rw-r--r--tests/validation/NEON/DirectConvolutionLayer.cpp37
11 files changed, 656 insertions, 409 deletions
diff --git a/Android.bp b/Android.bp
index db6e7faa26..3a49b8c362 100644
--- a/Android.bp
+++ b/Android.bp
@@ -448,6 +448,9 @@ cc_library_static {
"src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/directconv2d/nchw/all.cpp",
+ "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp",
+ "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp",
"src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp",
"src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp",
"src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp",
diff --git a/filelist.json b/filelist.json
index 185ef6d43f..7e47df959c 100644
--- a/filelist.json
+++ b/filelist.json
@@ -848,7 +848,9 @@
},
"sve": {
"fp16": [ "src/cpu/kernels/activation/generic/sve/fp16.cpp" ],
- "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ],
+ "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ]
+ },
+ "sve2":{
"qasymm8": [ "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp" ],
"qasymm8_signed": [ "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp" ],
"qsymm16": [ "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp" ]
@@ -1052,7 +1054,12 @@
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp"
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp",
+ "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp",
+ "src/cpu/kernels/directconv2d/nchw/all.cpp"
+ ],
+ "fp32": [
+ "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp"
]
}
}
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
index f3560156bd..a4cdddee5e 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -22,26 +22,14 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
+#include "src/cpu/kernels/directconv2d/list.h"
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include <algorithm>
-
using namespace arm_compute::detail;
namespace arm_compute
@@ -50,8 +38,25 @@ namespace cpu
{
namespace kernels
{
-namespace
+static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels =
{
+ {
+ "neon_fp32_nhwc_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)
+ },
+ {
+ "neon_fp32_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)
+ },
+ {
+ "neon_fp16_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)
+ },
+};
+
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -99,346 +104,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
return std::make_pair(err, win);
}
-bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
-{
- return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
-}
-
-} // namespace
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
- // This function assumes that input and weights have not padding in channel
-
- // Declare useful types
- using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
- using vector_type = typename vtype::type;
- using tag_type = typename vtype::tag_type;
-
- // Scalar quantities
- const int element_size = src->info()->element_size();
- const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = src->info()->dimension(1);
- const int input_dim_h = src->info()->dimension(2);
-
- const int output_stride_c = dst->info()->strides_in_bytes().x();
-
- const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = weights->info()->dimension(1);
- const int kernel_dim_h = weights->info()->dimension(2);
-
- const int conv_pad_top = _conv_info.pad_top();
- const int conv_pad_left = _conv_info.pad_left();
- const int conv_stride_w = std::get<0>(_conv_info.stride());
- const int conv_stride_h = std::get<1>(_conv_info.stride());
-
- // Setup input window for the output iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Setup input window for the weights iterator
- Window window_w = calculate_max_window(*weights->info(), Steps());
- window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- Iterator out(dst, window_out);
- Iterator wei(weights, window_w);
-
- constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
- /*
- * This implementation parallelize the full WC plane of input and weights by
- * treating them as series of elements. So for example, a 3x3 weights and
- * floating point vector operations of 4 elements per time, the first 3
- * channel elements of the first row would be taken and additionally the first
- * element of the second row. The 9 elements in each single WC weight plane
- * would require 2 4-element vector operations and a last single element operation.
- *
- * This works since when we create the input vector to multiply with the weights,
- * the exact required elements are loaded in the same order. Therefore the
- * multiplication works on the correct input/weight elements.
- */
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- /*
- * In here we create theoretical indexes which then we validate for both
- * inputs and weights.
- * As a reminder, this loop take each output point in NHW, C is treated
- * in the weights loop.
- */
- // We are computing the theoretical starting input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
- const int index_h_start = in_h_start - in_h_start_t;
- const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
- const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- /*
- * This is the loop in the weights, and it goes along N (the batches)
- * As a reminder, the batches of the weights are translated into the
- * channels of the output
- */
- const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
- + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
- const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
- {
- const T *in_ptr_mover = in_ptr_row;
- int index_wc = index_wc_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_row + index_wc);
- out_temp += src_val * w_val;
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
- },
- wei);
- },
- out);
-}
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
- // Declare useful types
- using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
- using vector_type = typename vtype::type;
- using tag_type = typename vtype::tag_type;
-
- // Scalar quantities
- const int element_size = src->info()->element_size();
- const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = src->info()->dimension(1);
- const int input_dim_h = src->info()->dimension(2);
-
- const int output_stride_c = dst->info()->strides_in_bytes().x();
-
- const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = weights->info()->dimension(1);
- const int kernel_dim_h = weights->info()->dimension(2);
-
- const int conv_pad_top = _conv_info.pad_top();
- const int conv_pad_left = _conv_info.pad_left();
- const int conv_stride_w = std::get<0>(_conv_info.stride());
- const int conv_stride_h = std::get<1>(_conv_info.stride());
-
- // Setup input window for the output iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Setup input window for the weights iterator
- Window window_w = calculate_max_window(*weights->info(), Steps());
- window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- Iterator out(dst, window_out);
- Iterator wei(weights, window_w);
-
- constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical starting input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- const int index_c_end = weights->info()->dimension(0);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
- {
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c = 0;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_mover);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_mover);
- out_temp += src_val * w_val;
- }
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
- },
- wei);
- },
- out);
-}
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
- // Declare useful types
- using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
- using vector_type = typename vtype::type;
- using tag_type = typename vtype::tag_type;
-
- // Scalar quantities
- const int element_size = src->info()->element_size();
- const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size;
- const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size;
- const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size;
- const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
-
- const int input_dim_w = src->info()->dimension(0);
- const int input_dim_h = src->info()->dimension(1);
-
- const int output_stride_c = dst->info()->strides_in_bytes()[2];
-
- const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size;
- const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size;
-
- const int kernel_dim_w = weights->info()->dimension(0);
- const int kernel_dim_h = weights->info()->dimension(1);
-
- const int conv_pad_top = _conv_info.pad_top();
- const int conv_pad_left = _conv_info.pad_left();
- const int conv_stride_w = std::get<0>(_conv_info.stride());
- const int conv_stride_h = std::get<1>(_conv_info.stride());
-
- // Setup input window for the output iterator
- Window window_out = window;
- window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- // Setup input window for the weights iterator
- Window window_w = calculate_max_window(*weights->info(), Steps());
- window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- Iterator out(dst, window_out);
- Iterator wei(weights, window_w);
-
- constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical starting input starting points
- const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- const int index_c_end = weights->info()->dimension(2);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
- T out_temp = static_cast<T>(0);
-
- for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
- {
- const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
- const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
- const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
- int index_w = in_w_start;
- int index_wei_w = wei_w_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_w < in_w_end; ++index_w, ++index_wei_w)
- {
- const auto src_val = *(in_ptr_row + index_w * input_stride_w);
- const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
- out_temp += src_val * w_val;
- }
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
-
- },
- wei);
- },
- out);
-}
-
void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -484,53 +149,21 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_data_layout == DataLayout::NCHW)
- {
- switch(src->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- convolve_nchw<float16_t>(window, src, weights, dst);
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- convolve_nchw<float>(window, src, weights, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
- else
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- {
- if(have_zero_x_internal_padding(src->info(), weights->info()))
- {
- convolve_nhwc_optimized<float>(window, src, weights, dst);
- }
- else
- {
- convolve_nhwc<float>(window, src, weights, dst);
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
+ const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+ uk->ukernel(window, src, weights, dst, _conv_info);
}
const char *CpuDirectConv2dKernel::name() const
{
return "CpuDirectConvolutionLayerKernel";
}
+
+const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> &CpuDirectConv2dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
index 6ec4d4ee04..b9265dc630 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -36,6 +36,9 @@ namespace kernels
/** Interface for the kernel to perform Direct Convolution Layer. */
class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
{
+private:
+ using DirectConv2dKernel_Ptr = std::add_pointer<void(const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
+
public:
CpuDirectConv2dKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
@@ -67,19 +70,16 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-private:
- /* Template function for optimized convolution NHWC */
- template <typename T>
- void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
+ struct DirectConv2dKernel
+ {
+ const char *name;
+ const DataTypeDataLayoutSelectorPtr is_selected;
+ DirectConv2dKernel_Ptr ukernel;
+ };
- /* Template function for convolution NHWC */
- template <typename T>
- void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
-
- /* Template function for convolution NCHW */
- template <typename T>
- void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
+ static const std::vector<DirectConv2dKernel> &get_available_kernels();
+private:
PadStrideInfo _conv_info{};
unsigned int _kernel_size{ 0 };
DataLayout _data_layout{ DataLayout::UNKNOWN };
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 4a0ebd6e3f..8c5a39ad49 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -40,6 +40,13 @@ struct DataTypeISASelectorData
cpuinfo::CpuIsaInfo isa;
};
+struct DataTypeDataLayoutISASelectorData
+{
+ DataType dt;
+ DataLayout dl;
+ const cpuinfo::CpuIsaInfo &isa;
+};
+
struct PoolDataTypeISASelectorData
{
DataType dt;
@@ -63,6 +70,7 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData
};
// Selector pointer types
using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
+using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h
new file mode 100644
index 0000000000..9a0472643d
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/list.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
+#define SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
+
+#include "src/core/common/Registrars.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
+ void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d);
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d);
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nchw_directconv2d);
+
+#undef DECLARE_DIRECT_CONV2D_KERNEL
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp
new file mode 100644
index 0000000000..a719fa50d6
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nchw/all.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+// Forward declaration of the shared NCHW implementation; defined below.
+template <typename T>
+void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+/** FP16 NCHW direct conv2d entry point; thin dispatch to the templated implementation. */
+void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    convolve_nchw<float16_t>(window, src, weights, dst, conv_info);
+}
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
+/** FP32 NCHW direct conv2d entry point; thin dispatch to the templated implementation. */
+void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    convolve_nchw<float>(window, src, weights, dst, conv_info);
+}
+
+/** NCHW direct convolution: computes one output element per (x, y, batch) point of the
+ *  output window, accumulating over the full kernel window and all input channels.
+ *
+ *  The window_w loop iterates over the weights' batch dimension, which maps to the
+ *  output channels.
+ *
+ * @param window    Output window to iterate over (one iteration per output point).
+ * @param src       Input tensor (NCHW).
+ * @param weights   Weights tensor.
+ * @param dst       Output tensor (NCHW).
+ * @param conv_info Padding and stride information.
+ */
+template <typename T>
+void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    // Declare useful types
+    using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type = typename vtype::tag_type;
+
+    // Scalar quantities (strides converted from bytes to elements)
+    const int element_size = src->info()->element_size();
+    const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size;
+    const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size;
+    const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size;
+    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+
+    const int input_dim_w = src->info()->dimension(0);
+    const int input_dim_h = src->info()->dimension(1);
+
+    // Kept in bytes on purpose: out_ptr below is a uint8_t*.
+    const int output_stride_c = dst->info()->strides_in_bytes()[2];
+
+    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size;
+    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size;
+    const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size;
+
+    const int kernel_dim_w = weights->info()->dimension(0);
+    const int kernel_dim_h = weights->info()->dimension(1);
+
+    const int conv_pad_top = conv_info.pad_top();
+    const int conv_pad_left = conv_info.pad_left();
+    const int conv_stride_w = std::get<0>(conv_info.stride());
+    const int conv_stride_h = std::get<1>(conv_info.stride());
+
+    // Setup input window for the output iterator
+    Window window_out = window;
+    window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Setup input window for the weights iterator: collapse X/Y/Z so it only walks
+    // the weights' batch dimension (i.e. the output channels).
+    Window window_w = calculate_max_window(*weights->info(), Steps());
+    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    Iterator out(dst, window_out);
+    Iterator wei(weights, window_w);
+
+    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // We are computing the theoretical starting input starting points
+        const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
+        const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
+        const int in_w_end_t = in_w_start_t + kernel_dim_w;
+        const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+        // We are computing the valid initial and ending input points by checking the borders
+        const int in_w_start = std::max(in_w_start_t, 0);
+        const int in_h_start = std::max(in_h_start_t, 0);
+        const int in_w_end = std::min(in_w_end_t, input_dim_w);
+        const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+        // We use the input points to select the valid weight points to use
+        const int wei_w_start = in_w_start - in_w_start_t;
+        const int wei_h_start = in_h_start - in_h_start_t;
+        const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+        const int index_c_end = weights->info()->dimension(2);
+        const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
+        execute_window_loop(window_w, [&](const Coordinates & id_w)
+        {
+            const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+            uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+            T out_temp = static_cast<T>(0);
+
+            // Accumulate over input channels, then kernel rows, then kernel columns.
+            for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
+            {
+                const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
+                const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
+                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+                {
+                    const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
+                    const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
+                    int index_w = in_w_start;
+                    int index_wei_w = wei_w_start;
+                    vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                    // NOTE(review): vloadq reads num_elems_read_per_iteration consecutive
+                    // elements, which presumes input_stride_w == 1 and kernel_stride_w == 1
+                    // (no internal X padding) — confirm with the kernel's configure/validate.
+                    for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
+                    {
+                        const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
+                        const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
+                        out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                    }
+                    out_temp += vreduce(out_temp_vec);
+                    // Scalar tail for the remaining (non-multiple-of-vector-length) columns.
+                    for(; index_w < in_w_end; ++index_w, ++index_wei_w)
+                    {
+                        const auto src_val = *(in_ptr_row + index_w * input_stride_w);
+                        const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
+                        out_temp += src_val * w_val;
+                    }
+                }
+            }
+            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+
+        },
+        wei);
+    },
+    out);
+}
+
+// Guard matches the one around neon_fp16_nchw_directconv2d above: without
+// ENABLE_FP16_KERNELS the fp16 instantiation has no caller and need not be compiled.
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void convolve_nchw<float16_t>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
+template void convolve_nchw<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
new file mode 100644
index 0000000000..9982431de5
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** FP32 NHWC direct conv2d entry point; thin dispatch to the templated NHWC implementation. */
+void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    convolve_nhwc<float>(window, src, weights, dst, conv_info);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
new file mode 100644
index 0000000000..500ad1b420
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <algorithm>
+
+using namespace arm_compute::detail;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+/** True when neither src nor weights have padding elements at the start or end of the
+ *  innermost (X) dimension, i.e. each WC plane is stored contiguously. Gates the
+ *  optimized NHWC path below. Both tensors are only read, hence const.
+ */
+bool have_zero_x_internal_padding(const ITensorInfo *src, const ITensorInfo *weights)
+{
+    return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
+}
+} // namespace
+
+/** NHWC direct convolution: computes one output element per (width, height, batch) point
+ *  of the output window, accumulating over the kernel window and all input channels.
+ *
+ *  Two paths: an optimized one that treats each WC plane of input/weights as a flat
+ *  contiguous run (valid only when there is no internal X padding, see
+ *  have_zero_x_internal_padding), and a generic per-element fallback.
+ *
+ * @param window    Output window to iterate over (one iteration per output point).
+ * @param src       Input tensor (NHWC).
+ * @param weights   Weights tensor; its batch dimension maps to the output channels.
+ * @param dst       Output tensor (NHWC).
+ * @param conv_info Padding and stride information.
+ */
+template <typename T>
+void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    // Declare useful types
+    using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type = typename vtype::tag_type;
+
+    // Scalar quantities (strides converted from bytes to elements)
+    const int element_size = src->info()->element_size();
+    const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
+    const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
+    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+
+    // Kept in bytes on purpose: out_ptr below is a uint8_t*.
+    const int output_stride_c = dst->info()->strides_in_bytes().x();
+
+    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
+    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
+    const int kernel_dim_w = weights->info()->dimension(1);
+    const int kernel_dim_h = weights->info()->dimension(2);
+
+    const int conv_pad_top = conv_info.pad_top();
+    const int conv_pad_left = conv_info.pad_left();
+    const int conv_stride_w = std::get<0>(conv_info.stride());
+    const int conv_stride_h = std::get<1>(conv_info.stride());
+
+    // Setup input window for the output iterator
+    Window window_out = window;
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Setup input window for the weights iterator: collapse X/Y/Z so it only walks
+    // the weights' batch dimension (i.e. the output channels).
+    Window window_w = calculate_max_window(*weights->info(), Steps());
+    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    Iterator out(dst, window_out);
+    Iterator wei(weights, window_w);
+
+    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+    // nhwc optimized
+    if(have_zero_x_internal_padding(src->info(), weights->info()))
+    {
+        // This function assumes that input and weights have not padding in channel
+
+        /*
+         * This implementation parallelize the full WC plane of input and weights by
+         * treating them as series of elements. So for example, a 3x3 weights and
+         * floating point vector operations of 4 elements per time, the first 3
+         * channel elements of the first row would be taken and additionally the first
+         * element of the second row. The 9 elements in each single WC weight plane
+         * would require 2 4-element vector operations and a last single element operation.
+         *
+         * This works since when we create the input vector to multiply with the weights,
+         * the exact required elements are loaded in the same order. Therefore the
+         * multiplication works on the correct input/weight elements.
+         */
+        execute_window_loop(
+            window_out, [&](const Coordinates & id)
+        {
+            /*
+             * In here we create theoretical indexes which then we validate for both
+             * inputs and weights.
+             * As a reminder, this loop take each output point in NHW, C is treated
+             * in the weights loop.
+             */
+            // We are computing the theoretical starting input starting points
+            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+            const int in_w_end_t = in_w_start_t + kernel_dim_w;
+            const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+            // We are computing the valid initial and ending input points by checking the borders
+            const int in_w_start = std::max(in_w_start_t, 0);
+            const int in_h_start = std::max(in_h_start_t, 0);
+            const int in_w_end = std::min(in_w_end_t, input_dim_w);
+            const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+            // We use the input points to select the valid weight points to use
+            // (wc indexes are flat offsets into the contiguous WC plane).
+            const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
+            const int index_h_start = in_h_start - in_h_start_t;
+            const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
+            const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+            execute_window_loop(
+                window_w, [&](const Coordinates & id_w)
+            {
+                /*
+                 * This is the loop in the weights, and it goes along N (the batches)
+                 * As a reminder, the batches of the weights are translated into the
+                 * channels of the output
+                 */
+                const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
+                                      + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
+                const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
+                uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+                T out_temp = static_cast<T>(0);
+                // Per kernel row: vectorized multiply-accumulate over the flat WC run,
+                // then a scalar tail for the leftover elements.
+                for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
+                {
+                    const T *in_ptr_mover = in_ptr_row;
+                    int index_wc = index_wc_start;
+                    vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                    for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+                    {
+                        const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                        const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
+                        out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                    }
+                    out_temp += vreduce(out_temp_vec);
+                    for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
+                    {
+                        const auto src_val = *(in_ptr_mover);
+                        const auto w_val = *(weights_ptr_row + index_wc);
+                        out_temp += src_val * w_val;
+                    }
+                }
+                *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+            },
+            wei);
+        },
+        out);
+    }
+    else // nhwc non optimized
+    {
+        execute_window_loop(
+            window_out, [&](const Coordinates & id)
+        {
+            // We are computing the theoretical starting input starting points
+            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+            const int in_w_end_t = in_w_start_t + kernel_dim_w;
+            const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+            // We are computing the valid initial and ending input points by checking the borders
+            const int in_w_start = std::max(in_w_start_t, 0);
+            const int in_h_start = std::max(in_h_start_t, 0);
+            const int in_w_end = std::min(in_w_end_t, input_dim_w);
+            const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+            // We use the input points to select the valid weight points to use
+            const int wei_w_start = in_w_start - in_w_start_t;
+            const int wei_h_start = in_h_start - in_h_start_t;
+            const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+            const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+            const int index_c_end = weights->info()->dimension(0);
+            const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
+
+            execute_window_loop(
+                window_w, [&](const Coordinates & id_w)
+            {
+                const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+                uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+                T out_temp = static_cast<T>(0);
+                // Generic path: vectorize along the channel dimension only, which is
+                // always contiguous in NHWC even with X padding on W.
+                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+                {
+                    const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
+                    const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
+                    for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+                    {
+                        const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+                        const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+                        int index_c = 0;
+                        vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                        for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
+                        {
+                            const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                            const auto w_vec = wrapper::vloadq(weights_ptr_mover);
+                            out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                        }
+                        out_temp += vreduce(out_temp_vec);
+                        for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+                        {
+                            const auto src_val = *(in_ptr_mover);
+                            const auto w_val = *(weights_ptr_mover);
+                            out_temp += src_val * w_val;
+                        }
+                    }
+                }
+                *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+            },
+            wei);
+        },
+        out);
+    }
+}
+
+template void convolve_nhwc<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
new file mode 100644
index 0000000000..88a151fba4
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
+#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
+
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+template <typename T>
+void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif //SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 824741db5f..0f4c6bb279 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -26,6 +26,8 @@
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
#include "tests/NEON/Accessor.h"
#include "tests/PaddingCalculator.h"
#include "tests/datasets/ShapeDatasets.h"
@@ -180,6 +182,41 @@ TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT)
validate(Accessor(dst), ref_dst);
}
+// Checks that CpuDirectConv2dKernel's registry picks the expected micro-kernel
+// (by name) for each supported (CPU extension, data type, data layout) combination.
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               concat(combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F32 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                      combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F16 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW }))),
+               cpu_ext, data_type, data_layout)
+{
+    using namespace cpu::kernels;
+
+    // Build the ISA description the selector expects from the dataset values.
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ data_type, data_layout, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string data_layout_str;
+    if(data_layout == DataLayout::NCHW)
+    {
+        data_layout_str = "nchw";
+    }
+    else
+    {
+        data_layout_str = "nhwc";
+    }
+
+    // Kernel names follow the convention "<ext>_<dt>_<layout>_directconv2d" (see directconv2d/list.h).
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_" + data_layout_str + "_directconv2d";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
// *INDENT-OFF*
// clang-format off
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(