From c9e519d2ea4780297d71e68cccc5de9c7bb7c0b4 Mon Sep 17 00:00:00 2001
From: alerah01 <alex.rahlis@arm.com>
Date: Mon, 31 Jan 2022 19:04:10 +0200
Subject: Decouple CpuDirectConv2dKernel

Resolves COMPMID-4626

Exclude SVE & SVE2 paths from android.bp
NDK version does not support these extensions.

Change-Id: I49b147d2a84819975d3225f2920106fa1a0d742f
Signed-off-by: alerah01 <alex.rahlis@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7136
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
---
 Android.bp                                       |   3 +
 filelist.json                                    |  11 +-
 src/cpu/kernels/CpuDirectConv2dKernel.cpp        | 425 ++---------------------
 src/cpu/kernels/CpuDirectConv2dKernel.h          |  22 +-
 src/cpu/kernels/CpuKernelSelectionTypes.h        |   8 +
 src/cpu/kernels/directconv2d/list.h              |  47 +++
 src/cpu/kernels/directconv2d/nchw/all.cpp        | 179 ++++++++++
 src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp  |  39 +++
 src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp  | 252 ++++++++++++++
 src/cpu/kernels/directconv2d/nhwc/neon/impl.h    |  42 +++
 tests/validation/NEON/DirectConvolutionLayer.cpp |  37 ++
 11 files changed, 656 insertions(+), 409 deletions(-)
 create mode 100644 src/cpu/kernels/directconv2d/list.h
 create mode 100644 src/cpu/kernels/directconv2d/nchw/all.cpp
 create mode 100644 src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
 create mode 100644 src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
 create mode 100644 src/cpu/kernels/directconv2d/nhwc/neon/impl.h

diff --git a/Android.bp b/Android.bp
index db6e7faa26..3a49b8c362 100644
--- a/Android.bp
+++ b/Android.bp
@@ -448,6 +448,9 @@ cc_library_static {
         "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
         "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp",
         "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp",
+        "src/cpu/kernels/directconv2d/nchw/all.cpp",
+        "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp",
+        "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp",
         "src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp",
         "src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp",
         "src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp",
diff --git a/filelist.json b/filelist.json
index 185ef6d43f..7e47df959c 100644
--- a/filelist.json
+++ b/filelist.json
@@ -848,7 +848,9 @@
           },
           "sve": {
             "fp16": [ "src/cpu/kernels/activation/generic/sve/fp16.cpp" ],
-            "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ],
+            "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ]
+          }, 
+          "sve2":{
             "qasymm8": [ "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp" ],
             "qasymm8_signed": [ "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp" ],
             "qsymm16": [ "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp" ]
@@ -1052,7 +1054,12 @@
               "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
               "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
               "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
-              "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp" 
+              "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp",
+              "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp",
+              "src/cpu/kernels/directconv2d/nchw/all.cpp"
+            ],
+            "fp32": [
+              "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp"
             ]
           }
         }
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
index f3560156bd..a4cdddee5e 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -22,26 +22,14 @@
  * SOFTWARE.
  */
 #include "src/cpu/kernels/CpuDirectConv2dKernel.h"
+#include "src/cpu/kernels/directconv2d/list.h"
 
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
-#include <algorithm>
-
 using namespace arm_compute::detail;
 
 namespace arm_compute
@@ -50,8 +38,25 @@ namespace cpu
 {
 namespace kernels
 {
-namespace
+static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels =
 {
+    {
+        "neon_fp32_nhwc_directconv2d",
+        [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
+        REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)
+    },
+    {
+        "neon_fp32_nchw_directconv2d",
+        [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
+        REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)
+    },
+    {
+        "neon_fp16_nchw_directconv2d",
+        [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)
+    },
+};
+
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -99,346 +104,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
     return std::make_pair(err, win);
 }
 
-bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
-{
-    return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
-}
-
-} // namespace
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
-    // This function assumes that input and weights have not padding in channel
-
-    // Declare useful types
-    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
-    using vector_type = typename vtype::type;
-    using tag_type    = typename vtype::tag_type;
-
-    // Scalar quantities
-    const int element_size   = src->info()->element_size();
-    const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
-    const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
-    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
-    const int input_dim_w    = src->info()->dimension(1);
-    const int input_dim_h    = src->info()->dimension(2);
-
-    const int output_stride_c = dst->info()->strides_in_bytes().x();
-
-    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
-    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
-    const int          kernel_dim_w    = weights->info()->dimension(1);
-    const int          kernel_dim_h    = weights->info()->dimension(2);
-
-    const int conv_pad_top  = _conv_info.pad_top();
-    const int conv_pad_left = _conv_info.pad_left();
-    const int conv_stride_w = std::get<0>(_conv_info.stride());
-    const int conv_stride_h = std::get<1>(_conv_info.stride());
-
-    // Setup input window for the output iterator
-    Window window_out = window;
-    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Setup input window for the weights iterator
-    Window window_w = calculate_max_window(*weights->info(), Steps());
-    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
-    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    Iterator out(dst, window_out);
-    Iterator wei(weights, window_w);
-
-    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
-    /*
-     * This implementation parallelize the full WC plane of input and weights by
-     * treating them as series of elements. So for example, a 3x3 weights and
-     * floating point vector operations of 4 elements per time, the first 3
-     * channel elements of the first row would be taken and additionally the first
-     * element of the second row. The 9 elements in each single WC weight plane
-     * would require 2 4-element vector operations and a last single element operation.
-     *
-     * This works since when we create the input vector to multiply with the weights,
-     * the exact required elements are loaded in the same order. Therefore the
-     * multiplication works on the correct input/weight elements.
-     */
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        /*
-         * In here we create theoretical indexes which then we validate for both
-         * inputs and weights.
-         * As a reminder, this loop take each output point in NHW, C is treated
-         * in the weights loop.
-         */
-        // We are computing the theoretical starting input starting points
-        const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
-        const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
-        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
-
-        // We are computing the valid initial and ending input points by checking the borders
-        const int in_w_start = std::max(in_w_start_t, 0);
-        const int in_h_start = std::max(in_h_start_t, 0);
-        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
-
-        // We use the input points to select the valid weight points to use
-        const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
-        const int index_h_start  = in_h_start - in_h_start_t;
-        const int index_wc_end   = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
-        const int index_h_end    = kernel_dim_h - (in_h_end_t - in_h_end);
-
-        execute_window_loop(window_w, [&](const Coordinates & id_w)
-        {
-            /*
-             * This is the loop in the weights, and it goes along N (the batches)
-             * As a reminder, the batches of the weights are translated into the
-             * channels of the output
-             */
-            const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
-                                  + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
-            const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
-            uint8_t *out_ptr         = out.ptr() + id_w[3] * output_stride_c;
-
-            T out_temp = static_cast<T>(0);
-            for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
-            {
-                const T    *in_ptr_mover = in_ptr_row;
-                int         index_wc     = index_wc_start;
-                vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
-                {
-                    const auto src_vec = wrapper::vloadq(in_ptr_mover);
-                    const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wc);
-                    out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
-                }
-                out_temp += vreduce(out_temp_vec);
-                for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
-                {
-                    const auto src_val = *(in_ptr_mover);
-                    const auto w_val   = *(weights_ptr_row + index_wc);
-                    out_temp += src_val * w_val;
-                }
-            }
-            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
-        },
-        wei);
-    },
-    out);
-}
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
-    // Declare useful types
-    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
-    using vector_type = typename vtype::type;
-    using tag_type    = typename vtype::tag_type;
-
-    // Scalar quantities
-    const int element_size   = src->info()->element_size();
-    const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
-    const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
-    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
-    const int input_dim_w    = src->info()->dimension(1);
-    const int input_dim_h    = src->info()->dimension(2);
-
-    const int output_stride_c = dst->info()->strides_in_bytes().x();
-
-    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
-    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
-    const int          kernel_dim_w    = weights->info()->dimension(1);
-    const int          kernel_dim_h    = weights->info()->dimension(2);
-
-    const int conv_pad_top  = _conv_info.pad_top();
-    const int conv_pad_left = _conv_info.pad_left();
-    const int conv_stride_w = std::get<0>(_conv_info.stride());
-    const int conv_stride_h = std::get<1>(_conv_info.stride());
-
-    // Setup input window for the output iterator
-    Window window_out = window;
-    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Setup input window for the weights iterator
-    Window window_w = calculate_max_window(*weights->info(), Steps());
-    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
-    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    Iterator out(dst, window_out);
-    Iterator wei(weights, window_w);
-
-    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
-
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // We are computing the theoretical starting input starting points
-        const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
-        const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
-        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
-
-        // We are computing the valid initial and ending input points by checking the borders
-        const int in_w_start = std::max(in_w_start_t, 0);
-        const int in_h_start = std::max(in_h_start_t, 0);
-        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
-
-        // We use the input points to select the valid weight points to use
-        const int wei_w_start = in_w_start - in_w_start_t;
-        const int wei_h_start = in_h_start - in_h_start_t;
-        const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
-        const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
-
-        const int      index_c_end  = weights->info()->dimension(0);
-        const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-
-        execute_window_loop(window_w, [&](const Coordinates & id_w)
-        {
-            const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
-            uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
-
-            T out_temp = static_cast<T>(0);
-            for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
-            {
-                const T *const in_ptr_row      = in_ptr_start + index_in_h * input_stride_h;
-                const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
-                for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
-                {
-                    const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
-                    const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
-                    int         index_c           = 0;
-                    vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                    for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
-                    {
-                        const auto src_vec = wrapper::vloadq(in_ptr_mover);
-                        const auto w_vec   = wrapper::vloadq(weights_ptr_mover);
-                        out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
-                    }
-                    out_temp += vreduce(out_temp_vec);
-                    for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
-                    {
-                        const auto src_val = *(in_ptr_mover);
-                        const auto w_val   = *(weights_ptr_mover);
-                        out_temp += src_val * w_val;
-                    }
-                }
-            }
-            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
-        },
-        wei);
-    },
-    out);
-}
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
-    // Declare useful types
-    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
-    using vector_type = typename vtype::type;
-    using tag_type    = typename vtype::tag_type;
-
-    // Scalar quantities
-    const int element_size   = src->info()->element_size();
-    const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size;
-    const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size;
-    const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size;
-    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
-
-    const int input_dim_w = src->info()->dimension(0);
-    const int input_dim_h = src->info()->dimension(1);
-
-    const int output_stride_c = dst->info()->strides_in_bytes()[2];
-
-    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size;
-    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size;
-    const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size;
-
-    const int kernel_dim_w = weights->info()->dimension(0);
-    const int kernel_dim_h = weights->info()->dimension(1);
-
-    const int conv_pad_top  = _conv_info.pad_top();
-    const int conv_pad_left = _conv_info.pad_left();
-    const int conv_stride_w = std::get<0>(_conv_info.stride());
-    const int conv_stride_h = std::get<1>(_conv_info.stride());
-
-    // Setup input window for the output iterator
-    Window window_out = window;
-    window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    // Setup input window for the weights iterator
-    Window window_w = calculate_max_window(*weights->info(), Steps());
-    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
-    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    Iterator out(dst, window_out);
-    Iterator wei(weights, window_w);
-
-    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
-
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // We are computing the theoretical starting input starting points
-        const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
-        const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
-        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
-
-        // We are computing the valid initial and ending input points by checking the borders
-        const int in_w_start = std::max(in_w_start_t, 0);
-        const int in_h_start = std::max(in_h_start_t, 0);
-        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
-
-        // We use the input points to select the valid weight points to use
-        const int wei_w_start = in_w_start - in_w_start_t;
-        const int wei_h_start = in_h_start - in_h_start_t;
-        const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
-
-        const int      index_c_end  = weights->info()->dimension(2);
-        const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-        execute_window_loop(window_w, [&](const Coordinates & id_w)
-        {
-            const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
-            uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
-            T              out_temp          = static_cast<T>(0);
-
-            for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
-            {
-                const T *const in_ptr_row_0      = in_ptr_start + index_in_c * input_stride_c;
-                const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
-                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
-                {
-                    const T    *in_ptr_row      = in_ptr_row_0 + index_in_h * input_stride_h;
-                    const T    *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
-                    int         index_w         = in_w_start;
-                    int         index_wei_w     = wei_w_start;
-                    vector_type out_temp_vec    = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                    for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
-                    {
-                        const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
-                        const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
-                        out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
-                    }
-                    out_temp += vreduce(out_temp_vec);
-                    for(; index_w < in_w_end; ++index_w, ++index_wei_w)
-                    {
-                        const auto src_val = *(in_ptr_row + index_w * input_stride_w);
-                        const auto w_val   = *(weights_ptr_row + index_wei_w * kernel_stride_w);
-                        out_temp += src_val * w_val;
-                    }
-                }
-            }
-            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
-
-        },
-        wei);
-    },
-    out);
-}
-
 void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -484,53 +149,21 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c
     auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
     auto dst     = tensors.get_tensor(TensorType::ACL_DST);
 
-    if(_data_layout == DataLayout::NCHW)
-    {
-        switch(src->info()->data_type())
-        {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F16:
-            {
-                convolve_nchw<float16_t>(window, src, weights, dst);
-                break;
-            }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            case DataType::F32:
-            {
-                convolve_nchw<float>(window, src, weights, dst);
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Data type not supported");
-                break;
-        }
-    }
-    else
-    {
-        switch(src->info()->data_type())
-        {
-            case DataType::F32:
-            {
-                if(have_zero_x_internal_padding(src->info(), weights->info()))
-                {
-                    convolve_nhwc_optimized<float>(window, src, weights, dst);
-                }
-                else
-                {
-                    convolve_nhwc<float>(window, src, weights, dst);
-                }
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Data type not supported");
-                break;
-        }
-    }
+    const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+    uk->ukernel(window, src, weights, dst, _conv_info);
 }
 const char *CpuDirectConv2dKernel::name() const
 {
     return "CpuDirectConvolutionLayerKernel";
 }
+
+const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> &CpuDirectConv2dKernel::get_available_kernels()
+{
+    return available_kernels;
+}
+
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
index 6ec4d4ee04..b9265dc630 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -36,6 +36,9 @@ namespace kernels
 /** Interface for the kernel to perform Direct Convolution Layer. */
 class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
 {
+private:
+    using DirectConv2dKernel_Ptr = std::add_pointer<void(const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
+
 public:
     CpuDirectConv2dKernel() = default;
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
@@ -67,19 +70,16 @@ public:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
-private:
-    /* Template function for optimized convolution NHWC */
-    template <typename T>
-    void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
+    struct DirectConv2dKernel
+    {
+        const char                         *name;
+        const DataTypeDataLayoutSelectorPtr is_selected;
+        DirectConv2dKernel_Ptr              ukernel;
+    };
 
-    /* Template function for convolution NHWC */
-    template <typename T>
-    void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
-
-    /* Template function for convolution NCHW */
-    template <typename T>
-    void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
+    static const std::vector<DirectConv2dKernel> &get_available_kernels();
 
+private:
     PadStrideInfo _conv_info{};
     unsigned int  _kernel_size{ 0 };
     DataLayout    _data_layout{ DataLayout::UNKNOWN };
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 4a0ebd6e3f..8c5a39ad49 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -40,6 +40,13 @@ struct DataTypeISASelectorData
     cpuinfo::CpuIsaInfo isa;
 };
 
+struct DataTypeDataLayoutISASelectorData
+{
+    DataType                   dt;
+    DataLayout                 dl;
+    const cpuinfo::CpuIsaInfo &isa;
+};
+
 struct PoolDataTypeISASelectorData
 {
     DataType            dt;
@@ -63,6 +70,7 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData
 };
 // Selector pointer types
 using DataTypeISASelectorPtr                      = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
+using DataTypeDataLayoutSelectorPtr               = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
 using PoolDataTypeISASelectorPtr                  = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
 using ElementwiseDataTypeISASelectorPtr           = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
 using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h
new file mode 100644
index 0000000000..9a0472643d
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/list.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
+#define SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
+
+#include "src/core/common/Registrars.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
+    void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d);
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d);
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nchw_directconv2d);
+
+#undef DECLARE_DIRECT_CONV2D_KERNEL
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp
new file mode 100644
index 0000000000..a719fa50d6
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nchw/all.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+template <typename T>
+void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    convolve_nchw<float16_t>(window, src, weights, dst, conv_info);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    convolve_nchw<float>(window, src, weights, dst, conv_info);
+}
+
+template <typename T>
+void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_UNUSED(conv_info);
+
+    // Declare useful types
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    // Scalar quantities
+    const int element_size   = src->info()->element_size();
+    const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size;
+    const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size;
+    const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size;
+    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+
+    const int input_dim_w = src->info()->dimension(0);
+    const int input_dim_h = src->info()->dimension(1);
+
+    const int output_stride_c = dst->info()->strides_in_bytes()[2];
+
+    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size;
+    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size;
+    const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size;
+
+    const int kernel_dim_w = weights->info()->dimension(0);
+    const int kernel_dim_h = weights->info()->dimension(1);
+
+    const int conv_pad_top  = conv_info.pad_top();
+    const int conv_pad_left = conv_info.pad_left();
+    const int conv_stride_w = std::get<0>(conv_info.stride());
+    const int conv_stride_h = std::get<1>(conv_info.stride());
+
+    // Setup input window for the output iterator
+    Window window_out = window;
+    window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Setup input window for the weights iterator
+    Window window_w = calculate_max_window(*weights->info(), Steps());
+    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    Iterator out(dst, window_out);
+    Iterator wei(weights, window_w);
+
+    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // We are computing the theoretical starting input starting points
+        const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
+        const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
+        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+
+        // We are computing the valid initial and ending input points by checking the borders
+        const int in_w_start = std::max(in_w_start_t, 0);
+        const int in_h_start = std::max(in_h_start_t, 0);
+        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+
+        // We use the input points to select the valid weight points to use
+        const int wei_w_start = in_w_start - in_w_start_t;
+        const int wei_h_start = in_h_start - in_h_start_t;
+        const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
+
+        const int      index_c_end  = weights->info()->dimension(2);
+        const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
+        execute_window_loop(window_w, [&](const Coordinates & id_w)
+        {
+            const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+            uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
+            T              out_temp          = static_cast<T>(0);
+
+            for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
+            {
+                const T *const in_ptr_row_0      = in_ptr_start + index_in_c * input_stride_c;
+                const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
+                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+                {
+                    const T    *in_ptr_row      = in_ptr_row_0 + index_in_h * input_stride_h;
+                    const T    *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
+                    int         index_w         = in_w_start;
+                    int         index_wei_w     = wei_w_start;
+                    vector_type out_temp_vec    = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                    for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
+                    {
+                        const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
+                        const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
+                        out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                    }
+                    out_temp += vreduce(out_temp_vec);
+                    for(; index_w < in_w_end; ++index_w, ++index_wei_w)
+                    {
+                        const auto src_val = *(in_ptr_row + index_w * input_stride_w);
+                        const auto w_val   = *(weights_ptr_row + index_wei_w * kernel_stride_w);
+                        out_temp += src_val * w_val;
+                    }
+                }
+            }
+            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+
+        },
+        wei);
+    },
+    out);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template void convolve_nchw<float16_t>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template void convolve_nchw<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
new file mode 100644
index 0000000000..9982431de5
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    convolve_nhwc<float>(window, src, weights, dst, conv_info);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
new file mode 100644
index 0000000000..500ad1b420
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <algorithm>
+
+using namespace arm_compute::detail;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
+{
+    return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
+}
+}
+
+template <typename T>
+void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+    // Declare useful types
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    // Scalar quantities
+    const int element_size   = src->info()->element_size();
+    const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
+    const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
+    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+    const int input_dim_w    = src->info()->dimension(1);
+    const int input_dim_h    = src->info()->dimension(2);
+
+    const int output_stride_c = dst->info()->strides_in_bytes().x();
+
+    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
+    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
+    const int          kernel_dim_w    = weights->info()->dimension(1);
+    const int          kernel_dim_h    = weights->info()->dimension(2);
+
+    const int conv_pad_top  = conv_info.pad_top();
+    const int conv_pad_left = conv_info.pad_left();
+    const int conv_stride_w = std::get<0>(conv_info.stride());
+    const int conv_stride_h = std::get<1>(conv_info.stride());
+
+    // Setup input window for the output iterator
+    Window window_out = window;
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Setup input window for the weights iterator
+    Window window_w = calculate_max_window(*weights->info(), Steps());
+    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    Iterator out(dst, window_out);
+    Iterator wei(weights, window_w);
+
+    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+    // nhwc optimized
+    if(have_zero_x_internal_padding(src->info(), weights->info()))
+    {
+        // This function assumes that input and weights have not padding in channel
+
+        /*
+        * This implementation parallelize the full WC plane of input and weights by
+        * treating them as series of elements. So for example, a 3x3 weights and
+        * floating point vector operations of 4 elements per time, the first 3
+        * channel elements of the first row would be taken and additionally the first
+        * element of the second row. The 9 elements in each single WC weight plane
+        * would require 2 4-element vector operations and a last single element operation.
+        *
+        * This works since when we create the input vector to multiply with the weights,
+        * the exact required elements are loaded in the same order. Therefore the
+        * multiplication works on the correct input/weight elements.
+        */
+        execute_window_loop(
+            window_out, [&](const Coordinates & id)
+        {
+            /*
+            * In here we create theoretical indexes which then we validate for both
+            * inputs and weights.
+            * As a reminder, this loop take each output point in NHW, C is treated
+            * in the weights loop.
+            */
+            // We are computing the theoretical starting input starting points
+            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+            const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+            const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+
+            // We are computing the valid initial and ending input points by checking the borders
+            const int in_w_start = std::max(in_w_start_t, 0);
+            const int in_h_start = std::max(in_h_start_t, 0);
+            const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+            const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+
+            // We use the input points to select the valid weight points to use
+            const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
+            const int index_h_start  = in_h_start - in_h_start_t;
+            const int index_wc_end   = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
+            const int index_h_end    = kernel_dim_h - (in_h_end_t - in_h_end);
+
+            execute_window_loop(
+                window_w, [&](const Coordinates & id_w)
+            {
+                /*
+                * This is the loop in the weights, and it goes along N (the batches)
+                * As a reminder, the batches of the weights are translated into the
+                * channels of the output
+                */
+                const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
+                                      + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
+                const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
+                uint8_t *out_ptr         = out.ptr() + id_w[3] * output_stride_c;
+
+                T out_temp = static_cast<T>(0);
+                for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
+                {
+                    const T    *in_ptr_mover = in_ptr_row;
+                    int         index_wc     = index_wc_start;
+                    vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                    for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+                    {
+                        const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                        const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wc);
+                        out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                    }
+                    out_temp += vreduce(out_temp_vec);
+                    for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
+                    {
+                        const auto src_val = *(in_ptr_mover);
+                        const auto w_val   = *(weights_ptr_row + index_wc);
+                        out_temp += src_val * w_val;
+                    }
+                }
+                *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+            },
+            wei);
+        },
+        out);
+    }
+    else // nhwc non optimized
+    {
+        execute_window_loop(
+            window_out, [&](const Coordinates & id)
+        {
+            // We are computing the theoretical starting input starting points
+            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+            const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+            const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+
+            // We are computing the valid initial and ending input points by checking the borders
+            const int in_w_start = std::max(in_w_start_t, 0);
+            const int in_h_start = std::max(in_h_start_t, 0);
+            const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+            const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+
+            // We use the input points to select the valid weight points to use
+            const int wei_w_start = in_w_start - in_w_start_t;
+            const int wei_h_start = in_h_start - in_h_start_t;
+            const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
+            const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
+
+            const int      index_c_end  = weights->info()->dimension(0);
+            const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
+
+            execute_window_loop(
+                window_w, [&](const Coordinates & id_w)
+            {
+                const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+                uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
+
+                T out_temp = static_cast<T>(0);
+                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+                {
+                    const T *const in_ptr_row      = in_ptr_start + index_in_h * input_stride_h;
+                    const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
+                    for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+                    {
+                        const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
+                        const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+                        int         index_c           = 0;
+                        vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                        for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
+                        {
+                            const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                            const auto w_vec   = wrapper::vloadq(weights_ptr_mover);
+                            out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                        }
+                        out_temp += vreduce(out_temp_vec);
+                        for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+                        {
+                            const auto src_val = *(in_ptr_mover);
+                            const auto w_val   = *(weights_ptr_mover);
+                            out_temp += src_val * w_val;
+                        }
+                    }
+                }
+                *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+            },
+            wei);
+        },
+        out);
+    }
+}
+
+template void convolve_nhwc<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
new file mode 100644
index 0000000000..88a151fba4
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
+#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
+
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+template <typename T>
+void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif //SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 824741db5f..0f4c6bb279 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -180,6 +182,41 @@ TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT)
     validate(Accessor(dst), ref_dst);
 }
 
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               concat(combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F32 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                      combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F16 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW }))),
+               cpu_ext, data_type, data_layout)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ data_type, data_layout, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string data_layout_str;
+    if(data_layout == DataLayout::NCHW)
+    {
+        data_layout_str = "nchw";
+    }
+    else
+    {
+        data_layout_str = "nhwc";
+    }
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_" + data_layout_str + "_directconv2d";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-- 
cgit v1.2.1