From b4bb6a03f717a320b935809fde795b3d6ec5a69f Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Mon, 24 May 2021 16:01:32 +0100 Subject: Rename ported functions Rename CpuPooling to CpuPool2d Rename CpuPoolingKernel to CpuPool2dKernel Rename CpuPoolingAssemblyWrapperKernel to CpuPool2dAssemblyWrapperKernel Move CpuPool2dAssemblyWrapperKernel in internal subfolder Rename CpuDepthwiseConvolutionNativeKernel to CpuDepthwiseConv2dNativeKernel Rename CpuDepthwiseConvolutionAssemblyDispatch to CpuDepthwiseConv2dAssemblyDispatch Rename CpuDepthwiseConvolution to CpuDepthwiseConv2d Rename CpuDirectConvolutionKernel to CpuDirectConv2dKernel Rename CpuDirectConvolutionOutputStageKernel to CpuDirectConv2dOutputStageKernel Rename CpuDirectConvolution to CpuDirectConv2d Rename ClPoolingKernel to ClPool2dKernel Rename ClPooling to ClPool2d Rename ClDirectConvolutionKernel to ClDirectConv2dKernel Resolves: COMPMID-4405 Change-Id: I8e48f015e4e492a76a7512f5679cb3eb0cd028f6 Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5708 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- Android.bp | 26 +- SConscript | 17 +- arm_compute/runtime/CL/functions/CLPoolingLayer.h | 2 +- .../NEON/functions/NEDirectConvolutionLayer.h | 2 +- .../runtime/NEON/functions/NEPoolingLayer.h | 2 +- scripts/clang_tidy_rules.py | 9 +- src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h | 4 +- src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h | 2 +- .../cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 919 +++++++++++++ .../cpu/kernels/CpuDepthwiseConv2dNativeKernel.h | 109 ++ .../CpuDepthwiseConvolutionNativeKernel.cpp | 918 ------------- .../kernels/CpuDepthwiseConvolutionNativeKernel.h | 117 -- src/core/cpu/kernels/CpuDirectConv2dKernel.cpp | 1385 ++++++++++++++++++++ src/core/cpu/kernels/CpuDirectConv2dKernel.h | 93 ++ .../kernels/CpuDirectConv2dOutputStageKernel.cpp | 513 ++++++++ .../cpu/kernels/CpuDirectConv2dOutputStageKernel.h | 87 ++ .../cpu/kernels/CpuDirectConvolutionKernel.cpp | 1385 -------------------- src/core/cpu/kernels/CpuDirectConvolutionKernel.h | 100 -- .../CpuDirectConvolutionOutputStageKernel.cpp | 513 -------- .../CpuDirectConvolutionOutputStageKernel.h | 93 -- src/core/cpu/kernels/CpuPool2dKernel.cpp | 514 ++++++++ src/core/cpu/kernels/CpuPool2dKernel.h | 78 ++ .../kernels/CpuPoolingAssemblyWrapperKernel.cpp | 276 ---- .../cpu/kernels/CpuPoolingAssemblyWrapperKernel.h | 123 -- src/core/cpu/kernels/CpuPoolingKernel.cpp | 514 -------- src/core/cpu/kernels/CpuPoolingKernel.h | 83 -- .../internal/CpuPool2dAssemblyWrapperKernel.cpp | 276 ++++ .../internal/CpuPool2dAssemblyWrapperKernel.h | 119 ++ src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 665 ++++++++++ src/core/gpu/cl/kernels/ClDirectConv2dKernel.h | 87 ++ .../gpu/cl/kernels/ClDirectConvolutionKernel.cpp | 665 ---------- .../gpu/cl/kernels/ClDirectConvolutionKernel.h | 97 -- src/core/gpu/cl/kernels/ClPool2dKernel.cpp | 509 +++++++ src/core/gpu/cl/kernels/ClPool2dKernel.h | 76 ++ src/core/gpu/cl/kernels/ClPoolingKernel.cpp | 509 ------- src/core/gpu/cl/kernels/ClPoolingKernel.h | 79 -- .../CL/functions/CLDirectConvolutionLayer.cpp | 16 +- src/runtime/CL/functions/CLPoolingLayer.cpp | 14 +- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 66 +- .../NEON/functions/NEDirectConvolutionLayer.cpp | 16 +- src/runtime/NEON/functions/NEPoolingLayer.cpp | 16 +- src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp | 523 ++++++++ src/runtime/cpu/operators/CpuDepthwiseConv2d.h | 213 +++ 
.../CpuDepthwiseConv2dAssemblyDispatch.cpp | 563 ++++++++ .../operators/CpuDepthwiseConv2dAssemblyDispatch.h | 86 ++ .../cpu/operators/CpuDepthwiseConvolution.cpp | 523 -------- .../cpu/operators/CpuDepthwiseConvolution.h | 230 ---- .../CpuDepthwiseConvolutionAssemblyDispatch.cpp | 563 -------- .../CpuDepthwiseConvolutionAssemblyDispatch.h | 97 -- src/runtime/cpu/operators/CpuDirectConv2d.cpp | 147 +++ src/runtime/cpu/operators/CpuDirectConv2d.h | 107 ++ src/runtime/cpu/operators/CpuDirectConvolution.cpp | 147 --- src/runtime/cpu/operators/CpuDirectConvolution.h | 121 -- src/runtime/cpu/operators/CpuPool2d.cpp | 156 +++ src/runtime/cpu/operators/CpuPool2d.h | 87 ++ src/runtime/cpu/operators/CpuPooling.cpp | 156 --- src/runtime/cpu/operators/CpuPooling.h | 99 -- src/runtime/gpu/cl/operators/ClDirectConv2d.cpp | 102 ++ src/runtime/gpu/cl/operators/ClDirectConv2d.h | 83 ++ .../gpu/cl/operators/ClDirectConvolution.cpp | 102 -- src/runtime/gpu/cl/operators/ClDirectConvolution.h | 92 -- src/runtime/gpu/cl/operators/ClPool2d.cpp | 101 ++ src/runtime/gpu/cl/operators/ClPool2d.h | 72 + src/runtime/gpu/cl/operators/ClPooling.cpp | 101 -- src/runtime/gpu/cl/operators/ClPooling.h | 75 -- .../NEON/DepthwiseConvolutionLayerNative.cpp | 6 +- 66 files changed, 7768 insertions(+), 7878 deletions(-) create mode 100644 src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp create mode 100644 src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h delete mode 100644 src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h create mode 100644 src/core/cpu/kernels/CpuDirectConv2dKernel.cpp create mode 100644 src/core/cpu/kernels/CpuDirectConv2dKernel.h create mode 100644 src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp create mode 100644 src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h delete mode 100644 src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDirectConvolutionKernel.h delete mode 100644 src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h create mode 100644 src/core/cpu/kernels/CpuPool2dKernel.cpp create mode 100644 src/core/cpu/kernels/CpuPool2dKernel.h delete mode 100644 src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h delete mode 100644 src/core/cpu/kernels/CpuPoolingKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuPoolingKernel.h create mode 100644 src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp create mode 100644 src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h create mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h create mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClPoolingKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClPoolingKernel.h create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.h create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp create mode 100644 
src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.h delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h create mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.cpp create mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.h delete mode 100644 src/runtime/cpu/operators/CpuDirectConvolution.cpp delete mode 100644 src/runtime/cpu/operators/CpuDirectConvolution.h create mode 100644 src/runtime/cpu/operators/CpuPool2d.cpp create mode 100644 src/runtime/cpu/operators/CpuPool2d.h delete mode 100644 src/runtime/cpu/operators/CpuPooling.cpp delete mode 100644 src/runtime/cpu/operators/CpuPooling.h create mode 100644 src/runtime/gpu/cl/operators/ClDirectConv2d.cpp create mode 100644 src/runtime/gpu/cl/operators/ClDirectConv2d.h delete mode 100644 src/runtime/gpu/cl/operators/ClDirectConvolution.cpp delete mode 100644 src/runtime/gpu/cl/operators/ClDirectConvolution.h create mode 100644 src/runtime/gpu/cl/operators/ClPool2d.cpp create mode 100644 src/runtime/gpu/cl/operators/ClPool2d.h delete mode 100644 src/runtime/gpu/cl/operators/ClPooling.cpp delete mode 100644 src/runtime/gpu/cl/operators/ClPooling.h diff --git a/Android.bp b/Android.bp index 13b70ea0b3..d1003f2d7d 100644 --- a/Android.bp +++ b/Android.bp @@ -283,18 +283,17 @@ cc_library_static { "src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp", "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp", "src/core/cpu/kernels/CpuCopyKernel.cpp", - "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp", + "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp", "src/core/cpu/kernels/CpuDequantizeKernel.cpp", - "src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp", - "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp", + "src/core/cpu/kernels/CpuDirectConv2dKernel.cpp", + "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp", "src/core/cpu/kernels/CpuElementwiseKernel.cpp", "src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp", "src/core/cpu/kernels/CpuFillKernel.cpp", "src/core/cpu/kernels/CpuFloorKernel.cpp", "src/core/cpu/kernels/CpuMulKernel.cpp", "src/core/cpu/kernels/CpuPermuteKernel.cpp", - "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp", - "src/core/cpu/kernels/CpuPoolingKernel.cpp", + "src/core/cpu/kernels/CpuPool2dKernel.cpp", "src/core/cpu/kernels/CpuQuantizeKernel.cpp", "src/core/cpu/kernels/CpuReshapeKernel.cpp", "src/core/cpu/kernels/CpuScaleKernel.cpp", @@ -321,6 +320,7 @@ cc_library_static { "src/core/cpu/kernels/add/sve/qsymm16.cpp", "src/core/cpu/kernels/floor/NEON/fp16.cpp", "src/core/cpu/kernels/floor/NEON/fp32.cpp", + "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp", "src/core/cpu/kernels/pooling/neon/fp16.cpp", "src/core/cpu/kernels/pooling/neon/fp32.cpp", "src/core/cpu/kernels/pooling/neon/nchw/all.cpp", @@ -348,7 +348,7 @@ cc_library_static { "src/core/gpu/cl/kernels/ClCropKernel.cpp", "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp", "src/core/gpu/cl/kernels/ClDequantizeKernel.cpp", - "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp", + "src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp", "src/core/gpu/cl/kernels/ClElementwiseKernel.cpp", "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp", "src/core/gpu/cl/kernels/ClFillKernel.cpp", @@ -362,7 +362,7 @@ 
cc_library_static { "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", "src/core/gpu/cl/kernels/ClMulKernel.cpp", "src/core/gpu/cl/kernels/ClPermuteKernel.cpp", - "src/core/gpu/cl/kernels/ClPoolingKernel.cpp", + "src/core/gpu/cl/kernels/ClPool2dKernel.cpp", "src/core/gpu/cl/kernels/ClQuantizeKernel.cpp", "src/core/gpu/cl/kernels/ClReshapeKernel.cpp", "src/core/gpu/cl/kernels/ClScaleKernel.cpp", @@ -635,10 +635,10 @@ cc_library_static { "src/runtime/cpu/operators/CpuConcatenate.cpp", "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp", "src/runtime/cpu/operators/CpuCopy.cpp", - "src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp", - "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp", "src/runtime/cpu/operators/CpuDequantize.cpp", - "src/runtime/cpu/operators/CpuDirectConvolution.cpp", + "src/runtime/cpu/operators/CpuDirectConv2d.cpp", "src/runtime/cpu/operators/CpuElementwise.cpp", "src/runtime/cpu/operators/CpuElementwiseUnary.cpp", "src/runtime/cpu/operators/CpuFill.cpp", @@ -647,7 +647,7 @@ cc_library_static { "src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp", "src/runtime/cpu/operators/CpuMul.cpp", "src/runtime/cpu/operators/CpuPermute.cpp", - "src/runtime/cpu/operators/CpuPooling.cpp", + "src/runtime/cpu/operators/CpuPool2d.cpp", "src/runtime/cpu/operators/CpuQuantize.cpp", "src/runtime/cpu/operators/CpuReshape.cpp", "src/runtime/cpu/operators/CpuScale.cpp", @@ -663,7 +663,7 @@ cc_library_static { "src/runtime/gpu/cl/operators/ClCopy.cpp", "src/runtime/gpu/cl/operators/ClCrop.cpp", "src/runtime/gpu/cl/operators/ClDequantize.cpp", - "src/runtime/gpu/cl/operators/ClDirectConvolution.cpp", + "src/runtime/gpu/cl/operators/ClDirectConv2d.cpp", "src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp", "src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp", "src/runtime/gpu/cl/operators/ClFill.cpp", @@ -674,7 +674,7 @@ cc_library_static { "src/runtime/gpu/cl/operators/ClMul.cpp", "src/runtime/gpu/cl/operators/ClPRelu.cpp", "src/runtime/gpu/cl/operators/ClPermute.cpp", - "src/runtime/gpu/cl/operators/ClPooling.cpp", + "src/runtime/gpu/cl/operators/ClPool2d.cpp", "src/runtime/gpu/cl/operators/ClQuantize.cpp", "src/runtime/gpu/cl/operators/ClReshape.cpp", "src/runtime/gpu/cl/operators/ClScale.cpp", diff --git a/SConscript b/SConscript index ac15c7fcfb..143823d013 100644 --- a/SConscript +++ b/SConscript @@ -300,12 +300,11 @@ if env['neon']: cpu_kernel_hp_files = ['src/core/cpu/kernels/CpuActivationKernel.cpp', 'src/core/cpu/kernels/CpuCastKernel.cpp', - 'src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp', - 'src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp', - 'src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp', + 'src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp', + 'src/core/cpu/kernels/CpuDirectConv2dKernel.cpp', + 'src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp', 'src/core/cpu/kernels/CpuPermuteKernel.cpp', - 'src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp', - 'src/core/cpu/kernels/CpuPoolingKernel.cpp', + 'src/core/cpu/kernels/CpuPool2dKernel.cpp', 'src/core/cpu/kernels/CpuReshapeKernel.cpp', ] cpu_kernel_files = ['src/core/cpu/kernels/CpuAddKernel.cpp', @@ -352,12 +351,12 @@ if env['neon']: ] cpu_operator_hp_files = ['src/runtime/cpu/operators/CpuActivation.cpp', 'src/runtime/cpu/operators/CpuCast.cpp', - 'src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp', - 
'src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp', - 'src/runtime/cpu/operators/CpuDirectConvolution.cpp', + 'src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp', + 'src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp', + 'src/runtime/cpu/operators/CpuDirectConv2d.cpp', 'src/runtime/cpu/operators/CpuFlatten.cpp', 'src/runtime/cpu/operators/CpuPermute.cpp', - 'src/runtime/cpu/operators/CpuPooling.cpp', + 'src/runtime/cpu/operators/CpuPool2d.cpp', ] cpu_operator_files = ['src/runtime/cpu/operators/CpuAdd.cpp', 'src/runtime/cpu/operators/CpuConcatenate.cpp', diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h index 902feca234..1975e15470 100644 --- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h @@ -36,7 +36,7 @@ class CLCompileContext; class ICLTensor; class ITensorInfo; -/** Basic function to run @ref opencl::ClPooling */ +/** Basic function to run @ref opencl::ClPool2d */ class CLPoolingLayer : public IFunction { public: diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index 82cabed6c9..70352fdfaa 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -39,7 +39,7 @@ class ITensorInfo; * * This function calls the following: * - * -# @ref cpu::CpuDirectConvolution + * -# @ref cpu::CpuDirectConv2d */ class NEDirectConvolutionLayer : public IFunction { diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index 1de587b444..b5366fa1c1 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -39,7 +39,7 @@ class ITensorInfo; /** Basic function to simulate a pooling layer with the specified pooling operation. 
This function calls the following kernels: * - * -# @ref cpu::CpuPooling + * -# @ref cpu::CpuPool2d */ class NEPoolingLayer : public IFunction { diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 97929b4eac..649d9343bb 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -108,15 +108,14 @@ def filter_clang_tidy_lines( lines ): ("NEWinogradLayerKernel.cpp" in line and "use '= default' to define a trivial destructor" in line) or ("NEGEMMLowpMatrixMultiplyCore.cpp" in line and "constructor does not initialize these fields" in line) or ("NEGEMMLowpAssemblyMatrixMultiplyCore" in line and "constructor does not initialize these fields" in line) or - ("CpuDepthwiseConvolutionNativeKernel" in line and re.search(r"parameter '[^']+' is unused", line)) or - ("CpuDepthwiseConvolutionAssemblyDispatch" in line and re.search(r"parameter '[^']+' is unused", line)) or - ("CpuDepthwiseConvolutionAssemblyDispatch" in line and "modernize-use-equals-default" in line) or + ("CpuDepthwiseConv2dNativeKernel" in line and re.search(r"parameter '[^']+' is unused", line)) or + ("CpuDepthwiseConv2dAssemblyDispatch" in line and re.search(r"parameter '[^']+' is unused", line)) or + ("CpuDepthwiseConv2dAssemblyDispatch" in line and "modernize-use-equals-default" in line) or ("CPUUtils.cpp" in line and "consider replacing 'unsigned long' with 'uint64'" in line) or ("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or ("CPUUtils.cpp" in line and "warning: uninitialized record type" in line) or ("Utils.h" in line and "warning: Use of zero-allocated memory" in line) or - ("CpuDepthwiseConvolutionNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line) or # This is to prevent false positive, should be reassessed with the newer clang-tidy - ("CpuDepthwiseConvolutionNativeKernel.cpp" in line and "cppcoreguidelines-pro-type-member-init" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy + ("CpuDepthwiseConv2dNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy print_context=False continue diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h index cc96cf1a1f..45481d0507 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h @@ -53,7 +53,7 @@ public: * @param[in] compile_context The compile context to be used. * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] indices Tensor containing the offset to store the input elements in the output tensor. - * @ref opencl::ClPooling with indices should precede this function in order to + * @ref CLPoolingLayer with indices should precede this function in order to * properly reconstruct the output tensor. * The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32. * @param[out] output Destination tensor. Data types supported: Same as @p input. @@ -65,7 +65,7 @@ public: * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] indices TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor. 
- * @ref opencl::ClPooling with indices should precede this function in order to + * @ref CLPoolingLayer with indices should precede this function in order to * properly reconstruct the output tensor. * The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h index f42272826c..ecc116e585 100644 --- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h @@ -56,7 +56,7 @@ public: * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] indices Tensor containing the offset to store the input elements in the output tensor. - * @ref cpu::kernels::CpuPoolingKernel with indices should precede this function in order to + * @ref NEPoolingLayer with indices should precede this function in order to * properly reconstruct the output tensor. * The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32. * @param[out] output Destination tensor. Data types supported: Same as @p input. diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp new file mode 100644 index 0000000000..4ddb35f2d5 --- /dev/null +++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +constexpr auto data_layout = DataLayout::NHWC; +const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + +constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); +constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); +constexpr size_t vector_size = 8; + +struct DepthwiseConvolutionRunInfo +{ + const size_t num_read_elements_per_iteration; + const uint32_t x_start; + const uint32_t x_end; + const uint32_t x_step; + const uint32_t x_leftover_start; + const size_t input_stride_y; + const size_t input_stride_z; + const size_t input_max_offset; + const size_t weights_width; + const size_t weights_height; + const size_t weights_stride_y; + const size_t weights_stride_z; + const size_t conv_stride_x; + const size_t conv_stride_y; + const size_t conv_pad_left; + const size_t conv_pad_top; + const size_t input_height; + const size_t input_width; + const size_t input_depth; + + DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT + : num_read_elements_per_iteration((depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), + x_start(w.x().start()), + x_end(w.x().end()), + x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)), + x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))), + input_stride_y(input.strides_in_bytes().y()), + input_stride_z(input.strides_in_bytes().z()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + weights_width(weights.dimension(width_idx)), + weights_height(weights.dimension(height_idx)), + weights_stride_y(weights.strides_in_bytes().y()), + weights_stride_z(weights.strides_in_bytes().z()), + conv_stride_x(conv_info.stride().first), + conv_stride_y(conv_info.stride().second), + conv_pad_left(conv_info.pad_left()), + conv_pad_top(conv_info.pad_top()), + input_height(input.dimension(height_idx)), + input_width(input.dimension(width_idx)), + input_depth(input.dimension(channel_idx)) + { + } +}; + +inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) +{ + const int32_t current_h = base_h + h * dilation.y(); + const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height); + + const int32_t current_w = base_w + w * dilation.x(); + const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width); + + return is_valid_h && is_valid_w; +} + +template <typename T> +void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, const Window &window, bool has_biases) +{ + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type; + using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); + + const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{}); + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; + + for(; x < run_info.x_leftover_start; x += run_info.x_step) + { + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(uint32_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); 
for(uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? + wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : + zero_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); + } + + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); + } + + for(; x < run_info.x_end; ++x) + { + auto acc_scalar = T{ 0 }; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; + const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) +{ + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::vector acc(depth_multiplier, static_cast(0)); + + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int 
input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); + + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } + } + else + { + for(size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +{ + ARM_COMPUTE_UNUSED(output_multiplier, output_shift); + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector::type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + using AccType = int32_t; + using AccArrayType = std::array; + + const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); + const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); + + const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - 
run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + size_t x = run_info.x_start; + + for(; x < run_info.x_leftover_start; x += run_info.x_step) + { + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? + wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : + out_of_bound_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + for(size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); + for(size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; + + if(has_biases) + { + acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); + } + + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if(out_shift < 0) + { + acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; + } + out_vals[i] = static_cast(utility::clamp(acc.at(i))); + } + + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); + } + + // left-over + for(; x < run_info.x_end; ++x) + { + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? 
+ *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : + out_of_bound_value; + const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + T out_vals{ 0 }; + + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; + + if(has_biases) + { + acc += *(reinterpret_cast(biases_it.ptr()) + x); + } + + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); + + if(out_shift < 0) + { + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + } + + out_vals = static_cast(utility::clamp(acc)); + *(reinterpret_cast(output_it.ptr()) + x) = out_vals; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +{ + using AccType = int32_t; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); + + const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::vector acc(depth_multiplier, 0); + std::vector we_sum(depth_multiplier, 0); + AccType in_sum = 0; + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for(size_t w = 0; w < 
run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; + + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) += input_val * weights_val; + + we_sum.at(m) += weights_val; + } + + offs += dilation.x() * run_info.input_stride_y; + in_sum += input_val; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for(size_t m = 0; m < depth_multiplier; ++m) + { + acc.at(m) -= in_sum * weights_qoffset; + acc.at(m) -= we_sum.at(m) * input_qoffset; + acc.at(m) += k_offset; + + if(has_biases) + { + acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + } + + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); + if(out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; + } + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +{ + constexpr int half_vec = vector_size / 2; + + using AccType = int32_t; + using AccVectorType = typename wrapper::traits::neon_vector::type; + using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(src->info()->quantization_info().uniform().offset), TagType{}))); + const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); + const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); + + const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); + const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); + const auto zero = wrapper::vdup_n(static_cast(0), AccVectorTagType{}); + + const auto out_mul = output_multiplier.at(0); + const auto out_shift = output_shift.at(0); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, 
dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + std::vector acc0(depth_multiplier / vector_size); + std::vector acc1(depth_multiplier / vector_size); + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::fill(begin(acc0), end(acc0), zero); + std::fill(begin(acc1), end(acc1), zero); + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + const int32_t current_h = input_z + h * dilation.y(); + if(current_h >= 0 && current_h < static_cast(run_info.input_height)) + { + int offs = input_offset; + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const int32_t current_w = input_y + w * dilation.x(); + if(current_w >= 0 && current_w < static_cast(run_info.input_width)) + { + const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); + const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); + const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); + + for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); + const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); + + acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); + acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); + } + } + + offs += dilation.x() * run_info.input_stride_y; + } + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + if(has_biases) + { + const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); + + acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); + acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); + } + + if(out_shift < 0) + { + acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); + acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); + } + else + { + acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); + acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); + } + + acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); + acc1.at(i) = 
wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); + + const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), + wrapper::vmovn(acc1.at(i))); + + if(std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); + } + else + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); + + if(is_data_type_quantized_asymmetric(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + } + + if(dst->total_size() != 0) + { + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} +} // namespace + +CpuDepthwiseConv2dNativeKernel::CpuDepthwiseConv2dNativeKernel() + : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases() +{ +} + +void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info)); + + _conv_info = info.pad_stride_info; + _depth_multiplier = info.depth_multiplier; + _dilation = info.dilation; + _has_biases = (biases != nullptr); + + if(is_data_type_quantized(src->data_type())) + { + const auto input_scale = src->quantization_info().uniform().scale; + const auto output_scale = dst->quantization_info().uniform().scale; + + auto weights_scale = weights->quantization_info().scale(); + if(!is_data_type_quantized_per_channel(weights->data_type())) + { + for(size_t i = 1; i < weights->dimension(channel_idx); ++i) + { + weights_scale.push_back(weights_scale.front()); + } + } + + for(const auto &s : weights_scale) + { + int32_t out_mult = 0; + int32_t out_shift = 0; + const float multiplier = input_scale * s / output_scale; + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); + + _output_multiplier.push_back(out_mult); + _output_shift.push_back(out_shift); + } + } + + switch(weights->data_type()) + { + case DataType::QASYMM8: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>; + break; + case DataType::QSYMM8_PER_CHANNEL: + if(src->data_type() == DataType::QASYMM8) + { + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>; + } + else + { + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>; + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>; + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info())); + + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); + return Status{}; +} + +template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnalber<T>> +void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, + ITensor *dst, const Window &window, bool has_biases) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + if(_depth_multiplier == 1) + { + depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases); + } + else + { + depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases); + } +} + +template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnalber<T>> +void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, + ITensor *dst, const Window &window, bool has_biases) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + if(_depth_multiplier == 1) + { + depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); + } + else + { + const bool is_pow2 = 
((_depth_multiplier & (_depth_multiplier - 1)) == 0); + const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); + + if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8) + { + depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); + } + else + { + depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); + } + } +} + +void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + (this->*_func)(src, weights, biases, dst, window, _has_biases); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h new file mode 100644 index 0000000000..559c46dc93 --- /dev/null +++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H +#define ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H + +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" +#include "support/Requires.h" + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#include <arm_neon.h> +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to run a depthwise convolution native on a tensor. 
*/ +class CpuDepthwiseConv2dNativeKernel : public ICpuKernel +{ +public: + const char *name() const override + { + return "CpuDepthwiseConv2dNativeKernel"; + } + /** Default constructor */ + CpuDepthwiseConv2dNativeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel); + + /** Initialize the function's source, destination and parameters. + * + * @note Supported data layouts: NHWC + * + * @param[in] src Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor. Data type supported: Same as @p src. + * @param[in] info Depthwise convolution meta-data. + * + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + +private: + template + using FloatEnalber = typename std::enable_if::value, int>::type; + + template = 0> + void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); + + template + using Quantized8bitEnalber = typename std::enable_if < std::is_same::value || std::is_same::value, int >::type; + + template = 0> + void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); + + /** Common signature for all the specialised depthwise convolution native functions + * + * @param[in] window Region on which to execute the kernel. + */ + using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); + + DepthwiseFunctionPtr _func; + PadStrideInfo _conv_info; + unsigned int _depth_multiplier; + Size2D _dilation; + std::vector _output_multiplier; + std::vector _output_shift; + bool _has_biases; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp deleted file mode 100644 index a5d1b61c08..0000000000 --- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp +++ /dev/null @@ -1,918 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -constexpr auto data_layout = DataLayout::NHWC; -const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); -const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); -const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - -constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); -constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); -constexpr size_t vector_size = 8; - -struct DepthwiseConvolutionRunInfo -{ - const size_t num_read_elements_per_iteration; - const uint32_t x_start; - const uint32_t x_end; - const uint32_t x_step; - const uint32_t x_leftover_start; - const size_t input_stride_y; - const size_t input_stride_z; - const size_t input_max_offset; - const size_t weights_width; - const size_t weights_height; - const size_t weights_stride_y; - const size_t weights_stride_z; - const size_t conv_stride_x; - const size_t conv_stride_y; - const size_t conv_pad_left; - const size_t conv_pad_top; - const size_t input_height; - const size_t input_width; - const size_t input_depth; - - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) - : num_read_elements_per_iteration((depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), - x_start(w.x().start()), - x_end(w.x().end()), - x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), - x_leftover_start(std::max(static_cast(w.x().end()) - static_cast(x_step) + 1, int32_t(0))), - input_stride_y(input.strides_in_bytes().y()), - input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), - weights_width(weights.dimension(width_idx)), - weights_height(weights.dimension(height_idx)), - weights_stride_y(weights.strides_in_bytes().y()), - weights_stride_z(weights.strides_in_bytes().z()), - conv_stride_x(conv_info.stride().first), - conv_stride_y(conv_info.stride().second), - conv_pad_left(conv_info.pad_left()), - conv_pad_top(conv_info.pad_top()), - input_height(input.dimension(height_idx)), - input_width(input.dimension(width_idx)), - input_depth(input.dimension(channel_idx)) - { - } -}; - -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) -{ - const int32_t current_h = base_h + h * dilation.y(); - const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); - - const int32_t current_w = base_w + w * dilation.x(); - const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); - - return is_valid_h && is_valid_w; -} - -template -void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) -{ - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); - - const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(uint32_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * 
sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); - } - - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); - acc_scalar += biases_vals; - } - *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) -{ - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, static_cast(0)); - - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - 
run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; - } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) -{ - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - using AccType = int32_t; - using AccArrayType = std::array; - - const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); - const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); - - const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - 
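For reference, the generic floating-point path above reduces to a plain NHWC depthwise loop: one accumulator per depth-multiplier output, a window walk with stride, padding and dilation, and an optional bias add. The standalone scalar sketch below spells out that indexing only; layouts and helper names are assumptions for illustration and it is not the library's implementation.

#include <cstddef>
#include <vector>

// Illustrative scalar NHWC depthwise convolution with depth multiplier and dilation.
// Assumed layouts: src is H x W x C (channel fastest), weights are Kh x Kw x (C * M),
// dst is Ho x Wo x (C * M).
void depthwise_nhwc_reference(const std::vector<float> &src, const std::vector<float> &weights,
                              const std::vector<float> &bias, std::vector<float> &dst,
                              size_t C, size_t W, size_t H, size_t Kw, size_t Kh,
                              size_t Wo, size_t Ho, size_t M,
                              size_t stride_x, size_t stride_y, int pad_left, int pad_top,
                              size_t dilation_x, size_t dilation_y, bool has_bias)
{
    for(size_t oy = 0; oy < Ho; ++oy)
    {
        for(size_t ox = 0; ox < Wo; ++ox)
        {
            for(size_t c = 0; c < C; ++c)
            {
                for(size_t m = 0; m < M; ++m)
                {
                    const size_t oc  = c * M + m; // output channel fed by input channel c
                    float        acc = 0.f;
                    for(size_t ky = 0; ky < Kh; ++ky)
                    {
                        const int iy = static_cast<int>(oy * stride_y) - pad_top + static_cast<int>(ky * dilation_y);
                        for(size_t kx = 0; kx < Kw; ++kx)
                        {
                            const int ix = static_cast<int>(ox * stride_x) - pad_left + static_cast<int>(kx * dilation_x);
                            // Out-of-bounds taps contribute zero, like the is_valid_input_region() check above.
                            if(ix >= 0 && ix < static_cast<int>(W) && iy >= 0 && iy < static_cast<int>(H))
                            {
                                const float in_val = src[(static_cast<size_t>(iy) * W + static_cast<size_t>(ix)) * C + c];
                                const float w_val  = weights[(ky * Kw + kx) * (C * M) + oc];
                                acc += in_val * w_val;
                            }
                        }
                    }
                    if(has_bias)
                    {
                        acc += bias[oc];
                    }
                    dst[(oy * Wo + ox) * (C * M) + oc] = acc;
                }
            }
        }
    }
}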
const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto const base_weights_ptr = weights_it.ptr(); - size_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - AccArrayType acc{}; - AccArrayType in_sum{}; - AccArrayType we_sum{}; - - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - out_of_bound_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; - - if(has_biases) - { - acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); - } - - const int32_t out_mul = output_multiplier.at(x + i); - const int32_t out_shift = output_shift.at(x + i); - if(out_shift < 0) - { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; - } - out_vals[i] = static_cast(utility::clamp(acc.at(i))); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); - } - - // left-over - for(; x < run_info.x_end; ++x) - { - AccType acc = 0; - AccType in_sum = 0; - AccType we_sum = 0; - - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
- *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : - out_of_bound_value; - const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc += input_val * weights_val; - in_sum += input_val; - we_sum += weights_val; - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - T out_vals{ 0 }; - - acc -= in_sum * weights_qoffset; - acc -= we_sum * input_qoffset; - acc += k_offset; - - if(has_biases) - { - acc += *(reinterpret_cast(biases_it.ptr()) + x); - } - - const int32_t out_mul = output_multiplier.at(x); - const int32_t out_shift = output_shift.at(x); - - if(out_shift < 0) - { - acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; - } - - out_vals = static_cast(utility::clamp(acc)); - *(reinterpret_cast(output_it.ptr()) + x) = out_vals; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) -{ - using AccType = int32_t; - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); - - const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, 0); - std::vector we_sum(depth_multiplier, 0); - AccType in_sum = 0; - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w 
= 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) += input_val * weights_val; - - we_sum.at(m) += weights_val; - } - - offs += dilation.x() * run_info.input_stride_y; - in_sum += input_val; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0; m < depth_multiplier; ++m) - { - acc.at(m) -= in_sum * weights_qoffset; - acc.at(m) -= we_sum.at(m) * input_qoffset; - acc.at(m) += k_offset; - - if(has_biases) - { - acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - } - - const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); - const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); - if(out_shift < 0) - { - acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; - } - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) -{ - constexpr int half_vec = vector_size / 2; - - using AccType = int32_t; - using AccVectorType = typename wrapper::traits::neon_vector::type; - using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(input->info()->quantization_info().uniform().offset), TagType{}))); - const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); - const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); - - const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); - const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); - const auto zero = wrapper::vdup_n(static_cast(0), AccVectorTagType{}); - - const auto out_mul = output_multiplier.at(0); - const auto out_shift = output_shift.at(0); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - 
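Both quantized loops above accumulate raw 8-bit products together with per-window sums of inputs and weights, then fold in the zero points through in_sum, we_sum and k_offset before requantising with saturating_doubling_high_mul and rounding_divide_by_exp2. The sketch below shows that arithmetic for a single accumulator; the two helpers are written out with the usual gemmlowp-style semantics, which is an assumption since their definitions are not part of this patch, and the bias add is omitted for brevity.

#include <algorithm>
#include <cstdint>
#include <limits>

// Assumed (gemmlowp-style) behaviour of the fixed-point helpers used above.
inline int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max(); // the single overflowing case saturates
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31)); // high 32 bits of 2*a*b, rounded
}

inline int32_t rounding_divide_by_exp2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0); // round-to-nearest right shift
}

// One accumulator of the quantized depthwise loop, after the inner h/w loops have produced
// raw_acc = sum(x * w), in_sum = sum(x) and we_sum = sum(w) over the Kw x Kh window.
inline int8_t requantize_accumulator(int32_t raw_acc, int32_t in_sum, int32_t we_sum,
                                     int32_t kernel_size, int32_t input_offset, int32_t weights_offset,
                                     int32_t output_offset, int32_t out_mul, int32_t out_shift)
{
    // Zero-point correction: sum((x - zx)(w - zw)) = sum(xw) - zw*sum(x) - zx*sum(w) + K*zx*zw.
    int32_t acc = raw_acc - in_sum * weights_offset - we_sum * input_offset + kernel_size * input_offset * weights_offset;

    if(out_shift < 0)
    {
        acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_offset;
    }
    else
    {
        acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_offset;
    }
    return static_cast<int8_t>(std::min(std::max(acc, -128), 127)); // clamp to the QASYMM8_SIGNED range
}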
win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - std::vector acc0(depth_multiplier / vector_size); - std::vector acc1(depth_multiplier / vector_size); - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::fill(begin(acc0), end(acc0), zero); - std::fill(begin(acc1), end(acc1), zero); - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - const int32_t current_h = input_z + h * dilation.y(); - if(current_h >= 0 && current_h < static_cast(run_info.input_height)) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const int32_t current_w = input_y + w * dilation.x(); - if(current_w >= 0 && current_w < static_cast(run_info.input_width)) - { - const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); - const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); - const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); - const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); - - acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); - acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); - } - } - - offs += dilation.x() * run_info.input_stride_y; - } - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - if(has_biases) - { - const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); - - acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); - acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); - } - - if(out_shift < 0) - { - acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - } - else - { - acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); - } - - acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), 
lower), upper); - acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); - - const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), - wrapper::vmovn(acc1.at(i))); - - if(std::is_same::value) - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); - } - else - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > input->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > input->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); - ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * info.depth_multiplier) != weights->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); - - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); - - if(is_data_type_quantized_asymmetric(input->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - } - - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} -} // namespace - -CpuDepthwiseConvolutionNativeKernel::CpuDepthwiseConvolutionNativeKernel() - : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases() -{ -} - -void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, weights, (biases != nullptr) ? 
biases : nullptr, output, info)); - - _conv_info = info.pad_stride_info; - _depth_multiplier = info.depth_multiplier; - _dilation = info.dilation; - _has_biases = (biases != nullptr); - - if(is_data_type_quantized(input->data_type())) - { - const auto input_scale = input->quantization_info().uniform().scale; - const auto output_scale = output->quantization_info().uniform().scale; - - auto weights_scale = weights->quantization_info().scale(); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - for(size_t i = 1; i < weights->dimension(channel_idx); ++i) - { - weights_scale.push_back(weights_scale.front()); - } - } - - for(const auto &s : weights_scale) - { - int32_t out_mult = 0; - int32_t out_shift = 0; - const float multiplier = input_scale * s / output_scale; - arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); - - _output_multiplier.push_back(out_mult); - _output_shift.push_back(out_shift); - } - } - - switch(weights->data_type()) - { - case DataType::QASYMM8: - _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise; - break; - case DataType::QASYMM8_SIGNED: - _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise; - break; - case DataType::QSYMM8_PER_CHANNEL: - if(input->data_type() == DataType::QASYMM8) - { - _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise; - } - else - { - _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise; - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise; - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info())); - - Window win = calculate_max_window(*output, Steps()); - ICpuKernel::configure(win); -} - -Status CpuDepthwiseConvolutionNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, info)); - return Status{}; -} - -template > -void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_fp(src, weights, biases, dst, _conv_info, _dilation, window, has_biases); - } - else - { - depthwise_loop_generic_fp(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases); - } -} - -template > -void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_quantized(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, 
_output_shift, window, has_biases); - } - else - { - const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0); - const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); - - if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8) - { - depthwise_loop_pow2_quantized_per_tensor(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - else - { - depthwise_loop_generic_quantized(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - } -} - -void CpuDepthwiseConvolutionNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, weights, biases, dst, window, _has_biases); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h deleted file mode 100644 index 242536d441..0000000000 --- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H -#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H - -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" -#include "support/Requires.h" - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#include -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to run a depthwise convolution native on a tensor. 
*/ -class CpuDepthwiseConvolutionNativeKernel : public ICpuKernel -{ -public: - const char *name() const override - { - return "CpuDepthwiseConvolutionNativeKernel"; - } - /** Default constructor */ - CpuDepthwiseConvolutionNativeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConvolutionNativeKernel); - - /** Initialize the function's source, destination and parameters. - * - * @note Supported data layouts: NHWC - * - * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - */ - void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionNativeKernel - * - * @note Supported data layouts: NHWC - * - * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - -private: - template - using FloatEnalber = typename std::enable_if::value, int>::type; - - template = 0> - void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); - - template - using Quantized8bitEnalber = typename std::enable_if < std::is_same::value || std::is_same::value, int >::type; - - template = 0> - void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); - - /** Common signature for all the specialised depthwise convolution native functions - * - * @param[in] window Region on which to execute the kernel. 
- */ - using DepthwiseFunctionPtr = void (CpuDepthwiseConvolutionNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); - - DepthwiseFunctionPtr _func; - PadStrideInfo _conv_info; - unsigned int _depth_multiplier; - Size2D _dilation; - std::vector _output_multiplier; - std::vector _output_shift; - bool _has_biases; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp new file mode 100644 index 0000000000..c0fc41525e --- /dev/null +++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp @@ -0,0 +1,1385 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h" + +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include + +using namespace arm_compute::detail; + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template +float16x8_t internal_vld1q(const float16_t *in); + +template <> +float16x8_t internal_vld1q<1>(const float16_t *in) +{ + return vld1q_f16(in); +} + +template <> +float16x8_t internal_vld1q<2>(const float16_t *in) +{ + const float16x8x2_t tmp = vld2q_f16(in); + return tmp.val[0]; +} + +template <> +float16x8_t internal_vld1q<3>(const float16_t *in) +{ + const float16x8x3_t tmp = vld3q_f16(in); + return tmp.val[0]; +} + +inline float16x8_t internal_vdupq_n(float16_t v) +{ + return vdupq_n_f16(v); +} + +inline void internal_vst1q(float16_t *p, const float16x8_t &v) +{ + vst1q_f16(p, v); +} + +float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y) +{ + return vmulq_f16(x, y); +} + +inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z) +{ + return vaddq_f16(x, vmulq_f16(y, z)); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template +float32x4_t internal_vld1q(const float *in); + +template <> +float32x4_t internal_vld1q<1>(const float *in) +{ + return vld1q_f32(in); +} + +template <> +float32x4_t internal_vld1q<2>(const float *in) +{ + const float32x4x2_t tmp = vld2q_f32(in); + return tmp.val[0]; +} + +template <> +float32x4_t internal_vld1q<3>(const float *in) +{ + const float32x4x3_t tmp = vld3q_f32(in); + return tmp.val[0]; +} + +inline float32x4_t internal_vdupq_n(float v) +{ + return vdupq_n_f32(v); +} + +inline void internal_vst1q(float *p, const float32x4_t &v) +{ + vst1q_f32(p, v); +} + +float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y) +{ + return vmulq_f32(x, y); +} + +inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z) +{ + return vmlaq_f32(x, y, z); +} + +constexpr int small_tensor_size_optim = 8; +inline bool run_optim_small_tensor_info(const ITensorInfo *t) +{ + return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim; +} + +inline bool run_optim_small_tensor(const ITensor *t) +{ + return run_optim_small_tensor_info(t->info()); +} + +// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8 +// For big Z as in Input=7x7x832, this implementation is faster than the general code becuase it doesn't need to +// store intermidiate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer. 
+template +class convolver_w1x1_i8x8_f32 +{ +public: + static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim); + ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim); + + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_h = dst->info()->dimension(1); + const int range_z = window.z().end() - window.z().start(); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + std::array accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + std::array accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + for(int oz = 0; oz < range_z; ++oz) + { + accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f); + accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f); + auto p_out_base = out_ptr + oz * output_stride_z; + for(int p = 0; p < kernel_depth; ++p) + { + const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk0 = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); + auto v_in0 = internal_vld1q(in_val); + auto v_in1 = internal_vld1q(in_val + 4); + 
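The inner loops of this convolver (and of the other convolvers in this file) are built on one primitive: broadcast a single weight, load a vector of inputs, and multiply-accumulate with vmlaq_f32 or internal_vmlal. A minimal self-contained version of that primitive, with a scalar fallback for builds without NEON, is sketched below; it is only an illustration of the intrinsic usage, not code from this patch.

#include <cstddef>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

// acc[i] += weight * in[i], four lanes at a time where NEON is available.
void mla_broadcast(const float *in, float weight, float *acc, size_t n)
{
    size_t i = 0;
#ifdef __ARM_NEON
    const float32x4_t vw = vdupq_n_f32(weight); // weight broadcast to all four lanes
    for(; i + 4 <= n; i += 4)
    {
        const float32x4_t vin  = vld1q_f32(in + i);  // four input values
        float32x4_t       vacc = vld1q_f32(acc + i); // running accumulators
        vacc = vmlaq_f32(vacc, vw, vin);             // per-lane multiply-accumulate
        vst1q_f32(acc + i, vacc);
    }
#endif
    for(; i < n; ++i)
    {
        acc[i] += weight * in[i]; // scalar tail / fallback
    }
}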
accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0); + accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1); + } + } + for(oh = 0; oh < output_h; ++oh) + { + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + vst1q_f32(p_out, accum0[oh]); + vst1q_f32(p_out + 4, accum1[oh]); + } + } + }, + in, out); + } +}; + +template +class convolver_1x1 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); + const int range_z = window.z().end() - window.z().start(); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + /* + For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1> + */ + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + for(int oz = 0; oz < range_z; ++oz) + { + auto p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto k_val = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast(input_ptr + (0 * input_stride_z + offset_xy)); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) + { + internal_vst1q(p_out, internal_vmull(vk, 
internal_vld1q(in_val))); + } + } + } + + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) + { + internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q(in_val))); + } + } + } + } + }, + in, out); + } +}; + +template +float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); + +inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2) +{ + const float32x4x3_t m00 = + { + { + vld1q_dup_f32(m0), + vld1q_dup_f32(m1), + vld1q_dup_f32(m2) + } + }; + return m00; +} + +inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4) +{ + const float32x4x2_t m00 = + { + { + vld1q_dup_f32(m3), + vld1q_dup_f32(m4) + } + }; + return m00; +} + +inline float32x4x3_t load_input(const float *const in) +{ + const float32x4x3_t vin = + { + { + vld1q_f32(in), + vld1q_f32(in + 4), + vld1q_f32(in + 8) + } + }; + return vin; +} + +template <> +inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) +{ + const float32x4x3_t vin0 = load_input(in_0); + const float32x4x3_t vin1 = load_input(in_1); + const float32x4x3_t vin2 = load_input(in_2); + const float32x4x3_t vin3 = load_input(in_3); + const float32x4x3_t vin4 = load_input(in_4); + const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0); + const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0); + const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1); + const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1); + const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2); + const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2); + const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3); + const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3); + const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4); + const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4); + + float32x4x2_t out = + { + { + vmulq_f32(vin0.val[0], m00.val[0]), + vmulq_f32(vin0.val[1], m00.val[0]) + } + }; + + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], 
m11.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]); + + return out; +} + +template <> +inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) +{ + float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 
1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) +{ + float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + +template +class convolver_3x3 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_x = weights->info()->strides_in_bytes().x(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); + const int num_planes_z = window.z().end() - window.z().start(); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + /* + Each thread executing this kernel computes one or more output's volume planes. 
+ + Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15], + the third thread [16,23] and the fourth thread [24,31]. + + The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary: the main benefit + is that we set up the Neon registers containing the kernel's values only once and then compute each XY using the preloaded registers, as opposed to doing this for every XY value. + + The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages: + 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values. + 2) Convolve the remaining planes and accumulate the results in the output plane which has been initialized in step 1. + */ + for(int oz = 0; oz < num_planes_z; ++oz) + { + const int zoffset = id.z() + oz; + uint8_t *p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto vk_r0 = load_matrix_row(ptr_k_r0); + const auto vk_r1 = load_matrix_row(ptr_k_r1); + const auto vk_r2 = load_matrix_row(ptr_k_r2); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_top = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); + auto in_mid = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); + auto in_low = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) + { + convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); + } + } + } + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w; + const uint8_t *input_base = input_ptr + p * input_stride_z; + const auto ptr_k_r0 = reinterpret_cast(ptr_k_base); + const auto ptr_k_r1 = reinterpret_cast(ptr_k_base + kernel_stride_y); + const auto ptr_k_r2 = reinterpret_cast(ptr_k_base + kernel_stride_y * 2); + const auto vk_r0 = load_matrix_row(ptr_k_r0); + const auto vk_r1 = load_matrix_row(ptr_k_r1); + const auto vk_r2 = load_matrix_row(ptr_k_r2); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_top = reinterpret_cast(input_base + (ih + 0) * input_stride_y); + auto in_mid = reinterpret_cast(input_base + (ih + 1) * input_stride_y); + auto in_low = reinterpret_cast(input_base + (ih + 2) * input_stride_y); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) + { + convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); + } + } + } + } + }, + in, out);
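+ // Each iteration of the lambda above covers num_planes_z output planes: Step 1 initializes each plane with the plane-0 convolution, and Step 2 accumulates the remaining kernel_depth - 1 kernel planes on top, so the result is computed in place without a scratch buffer.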
+ } +}; + +template +class convolver_5x5 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_x = weights->info()->strides_in_bytes().x(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); + const int num_planes_z = window.z().end() - window.z().start(); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + for(int oz = 0; oz < num_planes_z; ++oz) + { + const int zoffset = id.z() + oz; + uint8_t *p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r3 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r4 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * 
kernel_stride_y + 0 * kernel_stride_x); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_0 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); + auto in_1 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); + auto in_2 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); + auto in_3 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y); + auto in_4 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) + { + auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); + store_results(p_out, vres); + } + } + } + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const auto ptr_k_r0 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r3 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r4 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x); + + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_0 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y); + auto in_1 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y); + auto in_2 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y); + auto in_3 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y); + auto in_4 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) + { + auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); + accumulate_results(p_out, vres); + } + } + } + } + }, + in, out); + } +}; + +float vreduce(const float32x4_t &v) +{ + auto v0 = wrapper::vgethigh(v); + auto v1 = wrapper::vgetlow(v); + auto v_out = wrapper::vadd(v0, v1); + + float a = wrapper::vgetlane(v_out, 0); + float b = wrapper::vgetlane(v_out, 1); + return a + b; +} + +template +inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + 
convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +template <> +inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + if(run_optim_small_tensor(src)) + { + switch(conv_stride_x) + { + case 1: + convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info); + break; + case 2: + convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info); + break; + case 3: + convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + } + else + { + switch(conv_stride_x) + { + case 1: + convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + } +} + +template +inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +template +inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == 
DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16)); + + // Checks performed when output is configured + if(dst->total_size() != 0) + { + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + DataType data_type = src->data_type(); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, + unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size) +{ + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + + // Calculate right and bottom border + unsigned int kernel_size = weights->dimension(width_idx); + const int conv_stride_x = std::get<0>(conv_info.stride()); + const int conv_stride_y = std::get<1>(conv_info.stride()); + const int input_width = src->dimension(width_idx); + + Window win{}; + bool window_changed = false; + + if(data_layout == DataLayout::NCHW) + { + switch(kernel_size) + { + case 1: + { + switch(src->data_type()) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + num_elems_written_per_iteration = 8; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + if(run_optim_small_tensor_info(src)) + { + num_elems_written_per_iteration = 8; + } + else + { + num_elems_written_per_iteration = 4; + } + break; + default: + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } + num_weight_elems_read_per_row = kernel_size; + num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration; + break; + } + case 3: + switch(src->data_type()) + { + case DataType::F32: + num_weight_elems_read_per_row = 4 + kernel_size - 1; + num_elems_read_per_iteration = 12; + num_elems_written_per_iteration = 16 >> conv_stride_x; + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + num_weight_elems_read_per_row = 8 + kernel_size - 1; + num_elems_read_per_iteration = 24; + num_elems_written_per_iteration = 32 >> conv_stride_x; + break; +#endif 
/* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } + break; + case 5: + { + switch(src->data_type()) + { + case DataType::F32: + num_weight_elems_read_per_row = 4 + kernel_size - 1; + num_elems_read_per_iteration = 12; + num_elems_written_per_iteration = 16 >> conv_stride_x; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } + } + break; + default: + { + ARM_COMPUTE_ERROR("Not implemented"); + break; + } + } + + // Calculate right pad + int start_x = kernel_size / 2 - static_cast(conv_info.pad_left()); + int end_x = ceil_to_multiple(static_cast(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x; + int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width; + + // Calculate border + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + const unsigned int conv_pad_right = std::max(upper_bound_w, 0); + const unsigned int conv_pad_bottom = conv_info.pad_bottom(); + + border_size.left = conv_pad_left; + border_size.top = conv_pad_top; + border_size.right = conv_pad_right; + border_size.bottom = conv_pad_bottom; + + // Configure window + win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration)); + + AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, + num_elems_read_per_iteration, kernel_size, + conv_stride_x, conv_stride_y); + AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); + AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration); + window_changed = update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + } + else + { + // Configure window NHWC without any padding + win = calculate_max_window(*dst, Steps()); + } + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights) +{ + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); +} + +} // namespace + +template <typename T> +void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) +{ + // This function assumes that input and weights have no padding in the channel dimension + + // Declare useful types + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + + const int output_stride_c = dst->info()->strides_in_bytes().x(); + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); + + const int conv_pad_top = _conv_info.pad_top(); + const int conv_pad_left = _conv_info.pad_left(); + const int conv_stride_w = std::get<0>(_conv_info.stride()); + const int conv_stride_h = std::get<1>(_conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + /* + * This implementation parallelizes the full WC plane of the input and weights by + * treating them as a series of elements. So, for example, with 3x3 weights and + * floating-point vector operations of 4 elements at a time, the first 3 + * channel elements of the first row would be taken together with the first + * element of the second row. The 9 elements in each single WC weight plane + * would then require 2 4-element vector operations and a final single-element operation. + * + * This works because, when we create the input vector to multiply with the weights, + * exactly the required elements are loaded, in the same order. Therefore the + * multiplication operates on the correct input/weight elements. + */ + execute_window_loop(window_out, [&](const Coordinates & id) + { + /* + * Here we create theoretical indexes which we then validate for both + * the input and the weights. + * As a reminder, this loop takes each output point in NHW; C is handled + * in the weights loop.
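+ * + * For example, with conv_pad_left = 1 and a stride of 1, the output point at x = 0 gives + * in_w_start_t = -1; it is clamped to in_w_start = 0 and index_wc_start then skips the + * first kernel column (kernel_stride_w elements, i.e. all of its channels), so only valid + * input elements are read.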
+ */ + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; + const int index_h_start = in_h_start - in_h_start_t; + const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; + const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + execute_window_loop(window_w, [&](const Coordinates & id_w) + { + /* + * This is the loop in the weights, and it goes along N (the batches) + * As a reminder, the batches of the weights are translated into the + * channels of the output + */ + const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; + const T *weights_ptr_row = reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) + { + const T *in_ptr_mover = in_ptr_row; + int index_wc = index_wc_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_row + index_wc); + out_temp += src_val * w_val; + } + } + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); + }, + out); +} + +template +void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) +{ + // Declare useful types + using vtype = wrapper::traits::neon_bitvector; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + + const int output_stride_c = dst->info()->strides_in_bytes().x(); + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + 
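+ // Note: in NHWC the x dimension holds the channels, so the y/z byte strides above give the per-element W and H strides of both the input and the weights.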
const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); + + const int conv_pad_top = _conv_info.pad_top(); + const int conv_pad_left = _conv_info.pad_left(); + const int conv_stride_w = std::get<0>(_conv_info.stride()); + const int conv_stride_h = std::get<1>(_conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; + + execute_window_loop(window_w, [&](const Coordinates & id_w) + { + const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) + { + const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; + for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_mover); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val 
= *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); + }, + out); +} + +BorderSize CpuDirectConv2dKernel::border_size() const +{ + return _border_size; +} + +void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + _conv_info = conv_info; + _data_layout = src->data_layout(); + _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); + + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + const unsigned int conv_pad_right = conv_info.pad_right(); + const unsigned int conv_pad_bottom = conv_info.pad_bottom(); + if(_data_layout == DataLayout::NCHW) + { + _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left); + } + else + { + _border_size = BorderSize(0); + } + + // Get convolved dimensions + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + DataType data_type = src->data_type(); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, output_shape, 1, data_type); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info)); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row, + _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + unsigned int num_weight_elems_read_per_row = 0; + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_written_per_iteration = 0; + BorderSize border_size = {}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), + weights->clone().get(), + dst->clone().get(), + conv_info, + num_weight_elems_read_per_row, + num_elems_read_per_iteration, + num_elems_written_per_iteration, + border_size) + .first); + + return Status{}; +} + +void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); + + if(_data_layout == DataLayout::NCHW) + { + switch(kernel_size) + { + case 1: + { + switch(src->info()->data_type()) + { + case DataType::F32: + convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + 
default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + break; + } + case 3: + { + switch(src->info()->data_type()) + { + case DataType::F32: + convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + break; + } + case 5: + { + switch(src->info()->data_type()) + { + case DataType::F32: + convolve_5x5(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + break; + } + default: + { + ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported."); + break; + } + } + } + else + { + switch(src->info()->data_type()) + { + case DataType::F32: + { + if(have_zero_x_internal_padding(src->info(), weights->info())) + { + convolve_nhwc_optimized(window, src, weights, dst); + } + else + { + convolve_nhwc(window, src, weights, dst); + } + break; + } + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } +} +const char *CpuDirectConv2dKernel::name() const +{ + return "CpuDirectConvolutionLayerKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h new file mode 100644 index 0000000000..62ed96f255 --- /dev/null +++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform Direct Convolution Layer. */ +class CpuDirectConv2dKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuDirectConv2dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel); + /** Set the src, weights, and dst tensors. 
+ * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 + * + * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported:Same as @p input. + * @param[out] dst Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + BorderSize border_size() const override; + +private: + /* Template function for optimized convolution NHWC */ + template + void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); + + /* Template function for convolution NHWC */ + template + void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); + + PadStrideInfo _conv_info{}; + BorderSize _border_size{}; + unsigned int _kernel_size{ 0 }; + unsigned int _num_weight_elems_read_per_row{ 0 }; + unsigned int _num_elems_read_per_iteration{ 0 }; + unsigned int _num_elems_written_per_iteration{ 0 }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp new file mode 100644 index 0000000000..662d052941 --- /dev/null +++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + } + + if(src->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); + } + + // Checks performed when output is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + if(is_data_type_float(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + else if(src->data_type() == DataType::S32) + { + // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo + ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); + } + + return Status{}; +} + +template +typename std::enable_if::value, void>::type +output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + execute_window_loop(win, [&](const Coordinates & id) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + *(reinterpret_cast(out.ptr()) + x) = s_in; + } + + }, + in, out); +} + +template +typename std::enable_if::value, void>::type +output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); +} + +// Quantized case +template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > +void 
output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t; + using TagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = + { + { + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12) + } + }; + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = + { + { + wrapper::vadd(v_in.val[0], vb), + wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), + wrapper::vadd(v_in.val[3], vb) + } + }; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, + min, max, false)); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); + } + }, + in, out); +} +template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > +void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t; + using TagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = + { + { + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + } + }; + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32_t s_in = *in_ptr; + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); + } + }, + in, bi, out); +} +} // namespace + +void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + _func = nullptr; + _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; + _result_shift = info.result_shift; + _result_offset_after_shift = info.result_offset_after_shift; + + // Auto-initialize output output if required + if(dst != nullptr) + { + // Work out expected output data type + const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); + } + + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); + + const bool is_qasymm8_signed = (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; + + // Set appropriate function + if(src->data_layout() == DataLayout::NCHW) + { + switch(src->data_type()) + { + case DataType::S32: + { + if(is_qasymm8_signed) + { + _func = &output_stage_nchw; + } + else + { + _func = &output_stage_nchw; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nchw; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nchw; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } + else + { + switch(src->data_type()) + { + case DataType::S32: + { + if(is_qasymm8_signed) + { + _func = &output_stage_nhwc; + } + else + { + _func = &output_stage_nhwc; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nhwc; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nhwc; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } +} + +Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + return Status{}; +} + +void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); +} + +const char *CpuDirectConv2dOutputStageKernel::name() const +{ + return "CpuDirectConv2dOutputStageKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h new file mode 100644 index 0000000000..62bc5d41c9 --- /dev/null +++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. + * + * @note We assume bias to be shared + * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part + * of the @ref DirectConvolutionLayerOutputStageKernelInfo. + */ +class CpuDirectConv2dOutputStageKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuDirectConv2dOutputStageKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel); + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place. + * Data type supported: F16/F32/S32 + * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src + * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) + * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. + * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 + * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata + */ + void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2dOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); + + OutputStageKernel *_func{ nullptr }; + int _result_fixedpoint_multiplier{ 0 }; + int _result_shift{ 0 }; + int _result_offset_after_shift{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp deleted file mode 100644 index 4f46eb2bf6..0000000000 --- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp +++ /dev/null @@ -1,1385 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
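As a quick illustration of the new kernel's contract documented in CpuDirectConv2dOutputStageKernel.h above: callers configure it once with ITensorInfo metadata and then pass the real tensors through an ITensorPack when it is run (normally via the scheduler from the CpuDirectConv2d operator rather than by calling run_op() directly). The sketch below is illustrative only and not part of the patch; the tensor/info variable names are hypothetical placeholders, and it assumes the standard ITensorPack add_tensor()/add_const_tensor() helpers.

    // Sketch (hypothetical names): downscale an S32 accumulator to QASYMM8.
    CpuDirectConv2dOutputStageKernel output_stage;
    DirectConvolutionLayerOutputStageKernelInfo osk_info;
    osk_info.result_fixedpoint_multiplier = out_mult;
    osk_info.result_shift                 = out_shift;
    osk_info.result_offset_after_shift    = out_offset;
    osk_info.output_data_type             = DataType::QASYMM8;
    output_stage.configure(acc_info, bias_info, dst_info, osk_info); // ITensorInfo pointers

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, acc);        // S32 accumulator
    pack.add_const_tensor(TensorType::ACL_SRC_1, bias); // optional 1D bias
    pack.add_tensor(TensorType::ACL_DST, dst);          // QASYMM8 output
    output_stage.run_op(pack, output_stage.window(), ThreadInfo{});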
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h" - -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -using namespace arm_compute::detail; - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template -float16x8_t internal_vld1q(const float16_t *in); - -template <> -float16x8_t internal_vld1q<1>(const float16_t *in) -{ - return vld1q_f16(in); -} - -template <> -float16x8_t internal_vld1q<2>(const float16_t *in) -{ - const float16x8x2_t tmp = vld2q_f16(in); - return tmp.val[0]; -} - -template <> -float16x8_t internal_vld1q<3>(const float16_t *in) -{ - const float16x8x3_t tmp = vld3q_f16(in); - return tmp.val[0]; -} - -inline float16x8_t internal_vdupq_n(float16_t v) -{ - return vdupq_n_f16(v); -} - -inline void internal_vst1q(float16_t *p, const float16x8_t &v) -{ - vst1q_f16(p, v); -} - -float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y) -{ - return vmulq_f16(x, y); -} - -inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z) -{ - return vaddq_f16(x, vmulq_f16(y, z)); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -float32x4_t internal_vld1q(const float *in); - -template <> -float32x4_t internal_vld1q<1>(const float *in) -{ - return vld1q_f32(in); -} - -template <> -float32x4_t internal_vld1q<2>(const float *in) -{ - const float32x4x2_t tmp = vld2q_f32(in); - return tmp.val[0]; -} - -template <> -float32x4_t internal_vld1q<3>(const float *in) -{ - const float32x4x3_t tmp = vld3q_f32(in); - return tmp.val[0]; -} - -inline float32x4_t internal_vdupq_n(float v) -{ - return vdupq_n_f32(v); -} - -inline void internal_vst1q(float *p, const float32x4_t &v) -{ - vst1q_f32(p, v); -} - -float32x4_t 
internal_vmull(const float32x4_t &x, const float32x4_t &y) -{ - return vmulq_f32(x, y); -} - -inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z) -{ - return vmlaq_f32(x, y, z); -} - -constexpr int small_tensor_size_optim = 8; -inline bool run_optim_small_tensor_info(const ITensorInfo *t) -{ - return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim; -} - -inline bool run_optim_small_tensor(const ITensor *t) -{ - return run_optim_small_tensor_info(t->info()); -} - -// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8 -// For big Z as in Input=7x7x832, this implementation is faster than the general code becuase it doesn't need to -// store intermidiate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer. -template -class convolver_w1x1_i8x8_f32 -{ -public: - static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim); - ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim); - - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_h = dst->info()->dimension(1); - const int range_z = window.z().end() - window.z().start(); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - std::array accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - std::array accum1 = { 
vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for(int oz = 0; oz < range_z; ++oz) - { - accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f); - accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f); - auto p_out_base = out_ptr + oz * output_stride_z; - for(int p = 0; p < kernel_depth; ++p) - { - const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk0 = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); - auto v_in0 = internal_vld1q(in_val); - auto v_in1 = internal_vld1q(in_val + 4); - accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0); - accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1); - } - } - for(oh = 0; oh < output_h; ++oh) - { - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - vst1q_f32(p_out, accum0[oh]); - vst1q_f32(p_out + 4, accum1[oh]); - } - } - }, - in, out); - } -}; - -template -class convolver_1x1 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int range_z = window.z().end() - window.z().start(); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - /* - For a detailed explanation on how the algorithm works refer to template 
<> class convolver_3x3<1> - */ - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - for(int oz = 0; oz < range_z; ++oz) - { - auto p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto k_val = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + (0 * input_stride_z + offset_xy)); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) - { - internal_vst1q(p_out, internal_vmull(vk, internal_vld1q(in_val))); - } - } - } - - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) - { - internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q(in_val))); - } - } - } - } - }, - in, out); - } -}; - -template -float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); - -inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2) -{ - const float32x4x3_t m00 = - { - { - vld1q_dup_f32(m0), - vld1q_dup_f32(m1), - vld1q_dup_f32(m2) - } - }; - return m00; -} - -inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4) -{ - const float32x4x2_t m00 = - { - { - vld1q_dup_f32(m3), - vld1q_dup_f32(m4) - } - }; - return m00; -} - -inline float32x4x3_t load_input(const float *const in) -{ - const float32x4x3_t vin = - { - { - vld1q_f32(in), - vld1q_f32(in + 4), - vld1q_f32(in + 8) - } - }; - return vin; -} - -template <> -inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - const float32x4x3_t vin0 = load_input(in_0); - const float32x4x3_t vin1 = load_input(in_1); - const float32x4x3_t vin2 = load_input(in_2); - const float32x4x3_t vin3 = load_input(in_3); - const float32x4x3_t vin4 = load_input(in_4); - const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0); - const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0); - const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1); - const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1); - const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2); - const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2); - const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3); - const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3); - const float32x4x3_t m40 = load_matrix_hi(m4, 
1 + m4, 2 + m4); - const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4); - - float32x4x2_t out = - { - { - vmulq_f32(vin0.val[0], m00.val[0]), - vmulq_f32(vin0.val[1], m00.val[0]) - } - }; - - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]); - out.val[1] = vmlaq_f32(out.val[1], 
vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]); - - return out; -} - -template <> -inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); - return out; -} - -template <> -inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); - return out; -} - -template -class convolver_3x3 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over 
the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - /* - Each thread executing this kernel computes one or more output's volume planes. - - Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15], - the third thread [16,24] and the fourth thread [25,31]. - - The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary, the main benefit of this - is that we setup the neon registers containing the kernel's values only once and then compute each XY using the preloaded registers as opposed as doing this for every XY value. - - The algorithm does not require allocating any additional memory amd computes the results directly in-place in two stages: - 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values. - 2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1. - */ - for(int oz = 0; oz < num_planes_z; ++oz) - { - const int zoffset = id.z() + oz; - uint8_t *p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto vk_r0 = load_matrix_row(ptr_k_r0); - const auto vk_r1 = load_matrix_row(ptr_k_r1); - const auto vk_r2 = load_matrix_row(ptr_k_r2); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); - auto in_low = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) - { - convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); - } - } - } - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w; - const uint8_t *input_base = input_ptr + p * input_stride_z; - const auto ptr_k_r0 = reinterpret_cast(ptr_k_base); - const auto ptr_k_r1 = reinterpret_cast(ptr_k_base + kernel_stride_y); - const auto ptr_k_r2 = reinterpret_cast(ptr_k_base + kernel_stride_y * 2); - const auto vk_r0 = 
load_matrix_row(ptr_k_r0); - const auto vk_r1 = load_matrix_row(ptr_k_r1); - const auto vk_r2 = load_matrix_row(ptr_k_r2); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_base + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_base + (ih + 1) * input_stride_y); - auto in_low = reinterpret_cast(input_base + (ih + 2) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) - { - convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); - } - } - } - } - }, - in, out); - } -}; - -template -class convolver_5x5 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - for(int oz = 0; oz < num_planes_z; ++oz) - { - const int 
zoffset = id.z() + oz; - uint8_t *p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r3 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r4 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_0 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); - auto in_1 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); - auto in_2 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); - auto in_3 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y); - auto in_4 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) - { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); - store_results(p_out, vres); - } - } - } - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r3 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r4 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x); - - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_0 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y); - auto in_1 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y); - auto in_2 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y); - auto in_3 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y); - auto in_4 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) - { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); - accumulate_results(p_out, vres); - } - } - } - } - }, - in, out); - } 
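// A note on the vreduce() helper defined just after this class: it collapses a
// float32x4_t into a scalar by adding the high and low halves and then the two
// remaining lanes, i.e. vreduce({a, b, c, d}) == (a + c) + (b + d). The NHWC paths
// further down rely on it to fold the per-channel accumulator vector into a scalar.
// A minimal sketch of the same reduction with plain NEON intrinsics (illustrative
// only, requires <arm_neon.h>):
//
//     const float       vals[4] = {1.f, 2.f, 3.f, 4.f};
//     const float32x4_t v       = vld1q_f32(vals);
//     const float32x2_t sum     = vadd_f32(vget_high_f32(v), vget_low_f32(v));    // {4, 6}
//     const float       r       = vget_lane_f32(sum, 0) + vget_lane_f32(sum, 1);  // 10.f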
-}; - -float vreduce(const float32x4_t &v) -{ - auto v0 = wrapper::vgethigh(v); - auto v1 = wrapper::vgetlow(v); - auto v_out = wrapper::vadd(v0, v1); - - float a = wrapper::vgetlane(v_out, 0); - float b = wrapper::vgetlane(v_out, 1); - return a + b; -} - -template -inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template <> -inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - if(run_optim_small_tensor(src)) - { - switch(conv_stride_x) - { - case 1: - convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info); - break; - case 2: - convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info); - break; - case 3: - convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - } - else - { - switch(conv_stride_x) - { - case 1: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - } -} - -template -inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template -inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - 
case 1: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16)); - - // Checks performed when output is configured - if(dst->total_size() != 0) - { - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - DataType data_type = src->data_type(); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, - unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - - // Calculate right and bottom border - unsigned int kernel_size = weights->dimension(width_idx); - const int conv_stride_x = std::get<0>(conv_info.stride()); - const int conv_stride_y = std::get<1>(conv_info.stride()); - const int input_width = src->dimension(width_idx); - - Window win{}; - bool window_changed = false; - - if(data_layout == DataLayout::NCHW) - { - switch(kernel_size) - { - case 1: - { - switch(src->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - num_elems_written_per_iteration = 8; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - if(run_optim_small_tensor_info(src)) - { - 
num_elems_written_per_iteration = 8; - } - else - { - num_elems_written_per_iteration = 4; - } - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - num_weight_elems_read_per_row = kernel_size; - num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration; - break; - } - case 3: - switch(src->data_type()) - { - case DataType::F32: - num_weight_elems_read_per_row = 4 + kernel_size - 1; - num_elems_read_per_iteration = 12; - num_elems_written_per_iteration = 16 >> conv_stride_x; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - num_weight_elems_read_per_row = 8 + kernel_size - 1; - num_elems_read_per_iteration = 24; - num_elems_written_per_iteration = 32 >> conv_stride_x; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - break; - case 5: - { - switch(src->data_type()) - { - case DataType::F32: - num_weight_elems_read_per_row = 4 + kernel_size - 1; - num_elems_read_per_iteration = 12; - num_elems_written_per_iteration = 16 >> conv_stride_x; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - } - break; - default: - { - ARM_COMPUTE_ERROR("Not implemented"); - break; - } - } - - // Calculate right pad - int start_x = kernel_size / 2 - static_cast(conv_info.pad_left()); - int end_x = ceil_to_multiple(static_cast(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x; - int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width; - - // Calculate border - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_right = std::max(upper_bound_w, 0); - const unsigned int conv_pad_bottom = conv_info.pad_bottom(); - - border_size.left = conv_pad_left; - border_size.top = conv_pad_top; - border_size.right = conv_pad_right; - border_size.bottom = conv_pad_bottom; - - // Configure window - win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, - num_elems_read_per_iteration, kernel_size, - conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); - AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - } - else - { - // Configure window NHWC without any padding - win = calculate_max_window(*dst, Steps()); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights) -{ - return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); -} - -} // namespace - -template -void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // This function assumes that input and weights have not padding in channel - - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; - const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - /* - * This implementation parallelize the full WC plane of input and weights by - * treating them as series of elements. So for example, a 3x3 weights and - * floating point vector operations of 4 elements per time, the first 3 - * channel elements of the first row would be taken and additionally the first - * element of the second row. The 9 elements in each single WC weight plane - * would require 2 4-element vector operations and a last single element operation. - * - * This works since when we create the input vector to multiply with the weights, - * the exact required elements are loaded in the same order. Therefore the - * multiplication works on the correct input/weight elements. - */ - execute_window_loop(window_out, [&](const Coordinates & id) - { - /* - * In here we create theoretical indexes which then we validate for both - * inputs and weights. - * As a reminder, this loop take each output point in NHW, C is treated - * in the weights loop. 
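 *
 * A worked example of the clamping below (numbers chosen purely for illustration):
 * with conv_pad_left = 1, conv_stride_w = 1, kernel_dim_w = 3 and id.y() = 0, the
 * theoretical start is in_w_start_t = -1 and gets clamped to in_w_start = 0; the
 * difference (in_w_start - in_w_start_t) = 1 then advances index_wc_start by one
 * kernel_stride_w, so the first weight column is skipped instead of being multiplied
 * against input that exists only as padding.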
- */ - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; - const int index_h_start = in_h_start - in_h_start_t; - const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; - const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - /* - * This is the loop in the weights, and it goes along N (the batches) - * As a reminder, the batches of the weights are translated into the - * channels of the output - */ - const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) - + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; - const T *weights_ptr_row = reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) - { - const T *in_ptr_mover = in_ptr_row; - int index_wc = index_wc_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_row + index_wc); - out_temp += src_val * w_val; - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -template -void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / 
element_size; - const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) - { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_mover); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - 
const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; - } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -BorderSize CpuDirectConvolutionKernel::border_size() const -{ - return _border_size; -} - -void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - _conv_info = conv_info; - _data_layout = src->data_layout(); - _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); - - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_right = conv_info.pad_right(); - const unsigned int conv_pad_bottom = conv_info.pad_bottom(); - if(_data_layout == DataLayout::NCHW) - { - _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left); - } - else - { - _border_size = BorderSize(0); - } - - // Get convolved dimensions - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - DataType data_type = src->data_type(); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, data_type); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info)); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row, - _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - unsigned int num_weight_elems_read_per_row = 0; - unsigned int num_elems_read_per_iteration = 0; - unsigned int num_elems_written_per_iteration = 0; - BorderSize border_size = {}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), - weights->clone().get(), - dst->clone().get(), - conv_info, - num_weight_elems_read_per_row, - num_elems_read_per_iteration, - num_elems_written_per_iteration, - border_size) - .first); - - return Status{}; -} - -void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); - - if(_data_layout == DataLayout::NCHW) - { - switch(kernel_size) - { - case 1: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#endif /* 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - case 3: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - case 5: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_5x5(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - default: - { - ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported."); - break; - } - } - } - else - { - switch(src->info()->data_type()) - { - case DataType::F32: - { - if(have_zero_x_internal_padding(src->info(), weights->info())) - { - convolve_nhwc_optimized(window, src, weights, dst); - } - else - { - convolve_nhwc(window, src, weights, dst); - } - break; - } - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } -} -const char *CpuDirectConvolutionKernel::name() const -{ - return "CpuDirectConvolutionLayerKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h b/src/core/cpu/kernels/CpuDirectConvolutionKernel.h deleted file mode 100644 index fb8218394b..0000000000 --- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H -#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform Direct Convolution Layer. 
*/ -class CpuDirectConvolutionKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuDirectConvolutionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionKernel); - /** Set the input, weights, and output tensors. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * - * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported:Same as @p input. - * @param[out] dst Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - */ - void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionKernel - * - * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported:Same as @p input. - * @param[in] dst Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - BorderSize border_size() const override; - -private: - /* Template function for optimized convolution NHWC */ - template - void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - - /* Template function for convolution NHWC */ - template - void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - - PadStrideInfo _conv_info{}; - BorderSize _border_size{}; - unsigned int _kernel_size{ 0 }; - unsigned int _num_weight_elems_read_per_row{ 0 }; - unsigned int _num_elems_read_per_iteration{ 0 }; - unsigned int _num_elems_written_per_iteration{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp deleted file mode 100644 index 5f7a574e5a..0000000000 --- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - } - - if(src->data_type() == DataType::S32) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); - } - - // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - if(is_data_type_float(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - else if(src->data_type() == DataType::S32) - { - // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo - ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); - } - - return Status{}; -} - -template -typename std::enable_if::value, void>::type -output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - *(reinterpret_cast(out.ptr()) + x) = s_in; - } - - }, - in, out); -} - -template -typename std::enable_if::value, void>::type -output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); -} - -// Quantized case -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void 
output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) - } - }; - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = - { - { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) - } - }; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, - min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, out); -} -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - 
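As an aside, a rough scalar equivalent of the requantisation that finalize_quantization() applies to each S32 accumulator in this output stage (bias add, fixed-point multiply, shift, offset, clamp). This is a simplified sketch with approximate rounding and a hypothetical helper name, not the exact library routine; the vector path below does the same work a full register of lanes at a time.

    #include <algorithm>
    #include <cstdint>

    uint8_t requantize_qasymm8(int32_t acc, int32_t bias,
                               int32_t fixedpoint_multiplier, int shift, int32_t offset)
    {
        acc += bias;                                               // accumulate the shared bias
        int64_t v = (static_cast<int64_t>(acc) * fixedpoint_multiplier + (1ll << 30)) >> 31; // Q0.31 multiply
        if(shift > 0)
        {
            v = (v + (1ll << (shift - 1))) >> shift;               // rounding right shift
        }
        v += offset;                                               // result_offset_after_shift
        return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, v))); // clamp to QASYMM8
    }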
win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), - } - }; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32_t s_in = *in_ptr; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, bi, out); -} -} // namespace - -void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - _func = nullptr; - _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; - _result_shift = info.result_shift; - _result_offset_after_shift = info.result_offset_after_shift; - - // Auto-initialize output output if required - if(dst != nullptr) - { - // Work out expected output data type - const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); - } - - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); - - const bool is_qasymm8_signed = (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; - - // Set appropriate function - if(src->data_layout() == DataLayout::NCHW) - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nchw; - } - else - { - _func = &output_stage_nchw; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nchw; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nchw; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } - else - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nhwc; - } - else - { - _func = &output_stage_nhwc; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nhwc; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nhwc; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } -} - -Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - return Status{}; -} - -void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); -} - -const char *CpuDirectConvolutionOutputStageKernel::name() const -{ - return "CpuDirectConvolutionOutputStageKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h deleted file mode 100644 index 9eeab194cb..0000000000 --- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H -#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. - * - * @note We assume bias to be shared - * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part - * of the @ref DirectConvolutionLayerOutputStageKernelInfo. - */ -class CpuDirectConvolutionOutputStageKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuDirectConvolutionOutputStageKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionOutputStageKernel); - /** Set the accumulate buffer and the biases of the kernel. - * - * @param[in, out] src Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: F16/F32/S32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src - * @param[out] dst (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. - * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 - * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata - */ - void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionOutputStageKernel - * - * @param[in] src Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: F16/F32/S32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src - * @param[in] dst (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. 
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 - * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); - - OutputStageKernel *_func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPool2dKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp new file mode 100644 index 0000000000..e6f5890685 --- /dev/null +++ b/src/core/cpu/kernels/CpuPool2dKernel.cpp @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/cpu/kernels/CpuPool2dKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/common/Registrars.h" +#include "src/core/cpu/kernels/pooling/neon/list.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/ToolchainSupport.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +using namespace misc::shape_calculator; + +struct PoolingSelectorData +{ + DataType dt; + DataLayout dl; + int pool_stride_x; + Size2D pool_size; +}; + +using PoolingSelectorPtr = std::add_pointer::type; +using PoolingKernelPtr = std::add_pointer::type; +struct PoolingKernel +{ + const char *name; + const PoolingSelectorPtr is_selected; + PoolingKernelPtr ukernel; +}; + +static const PoolingKernel available_kernels[] = +{ + { + "poolingMxN_qasymm8_neon_nhwc", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) + }, + { + "poolingMxN_qasymm8_signed_neon_nhwc", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "poolingMxN_fp16_neon_nhwc", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "poolingMxN_fp32_neon_nhwc", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) + }, +#if defined(ENABLE_NCHW_KERNELS) + { + "pooling2_qasymm8_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) + }, + { + "pooling3_qasymm8_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) + }, + { + "poolingMxN_qasymm8_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) + }, + { + "pooling2_qasymm8_signed_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, + 
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) + }, + { + "pooling3_qasymm8_signed_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) + }, + { + "poolingMxN_qasymm8_signed_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "pooling2_fp16_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) + }, + { + "pooling3_fp16_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) + }, + { + "poolingMxN_fp16_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "pooling2_fp32_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) + }, + { + "pooling3_fp32_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) + }, + { + "pooling7_fp32_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) + }, + { + "poolingMxN_fp32_neon_nchw", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) + }, +#endif /* defined(ENABLE_NCHW_KERNELS) */ +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected({ dt, dl, pool_stride_x, pool_size })) + { + return &uk; + } + } + return nullptr; +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, + const ITensorInfo *indices, Size2D pool_size) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0); + + int pool_stride_x = 0; + int pool_stride_y = 0; + int 
output_width = 0; + int output_height = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + pool_size.x(), pool_size.y(), pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + + TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() + && (src->data_layout() == DataLayout::NHWC), + "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); + } + } + + const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, + unsigned int &num_elems_processed_per_iteration, + BorderSize &border_size, + int pool_size_x, int pool_size_y) +{ + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); + if(indices) + { + // Indices auto inizialitation if not yet initialized + auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, + pool_info))) + .set_data_type(DataType::U32) /* we store the offset to the element */); + } + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_horizontal_window = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int src_width = src->dimension(idx_width); + const int src_height = src->dimension(idx_height); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_right = pad_stride_info.pad_right(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const int pool_pad_bottom = pad_stride_info.pad_bottom(); + const bool is_square = pool_size_x == pool_size_y; + const unsigned int pooled_w = dst->dimension(idx_width); + const unsigned int pooled_h = dst->dimension(idx_height); + + //If it's not squared and optimized will be executed the MxN + num_elems_read_per_iteration = 1; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + + if(is_square) + { + switch(src->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + switch(pool_size_x) + { + case 2: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + case 3: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + default: + break; + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + switch(pool_size_x) + { + case 2: + case 3: + num_elems_read_per_iteration = 4; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + break; + } + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + switch(pool_size_x) + { + case 2: + num_elems_read_per_iteration = 2; + break; + case 3: + num_elems_read_per_iteration = 4; // We use vload4 for pooling3 + break; + case 7: + num_elems_read_per_iteration = 8; // We use vload8 for pooling7 + break; + default: + break; + } + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + } + + bool window_changed = false; + Window win{}; + if(data_layout == DataLayout::NCHW) + { + // Number of iterations in X dimension + const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; + // Upper limit for the number of right/bottom border elements that are accessed + const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; + border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); + border_size.right = std::max(upper_bound_w, pool_pad_right); + border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); + TensorShape dst_shape{ src->tensor_shape() }; + dst_shape.set(0, pooled_w); + dst_shape.set(1, pooled_h); + TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); + win = 
calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); + AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, ceil_to_multiple(src_width + border_size.right, pool_size_x), src_height + border_size.bottom); + AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window); + if(indices) + { + AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window); + window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); + } + else + { + window_changed = update_window_and_padding(win, src_access, dst_access); + } + dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + + border_size = src->padding(); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +BorderSize CpuPool2dKernel::border_size() const +{ + return _border_size; +} + +void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool is_global_pooling = pool_info.is_global_pooling; + + // Get data layout + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Update pool size in case of global pooling + const Size2D pool_size( + is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); + + // Set instance variables + _pool_info = pool_info; + _data_layout = src->data_layout(); + _pool_size = pool_size; + _pool_stride_x = pad_stride_info.stride().first; + + if(_data_layout == DataLayout::NHWC) + { + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); + } + else + { + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, + _border_size, pool_size.x(), pool_size.y()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); + } +} + +Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + unsigned int num_elems_processed_per_iteration = 0; + BorderSize border_size(0); + + const bool is_global_pooling = pool_info.is_global_pooling; + + // Get data layout + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), + (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, + pool_size_x, pool_size_y) + .first); + + return Status{}; +} + +void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); + ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first; + const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second; + const unsigned int pool_size = _pool_info.pool_size.width; + + Window window_src(window); + if(_data_layout == DataLayout::NCHW) + { + // Set step for src in x and y direction for the src + unsigned int window_x_inc = 0; + switch(src->info()->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + { + window_x_inc = pool_stride_x; + if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + { + window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + } + break; + } + + case DataType::F16: + case DataType::F32: + { + window_x_inc = pool_stride_x; + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); + window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); + } + else + { + window_src.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); + window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); + } + + const auto *uk = get_implementation(src->info()->data_type(), _data_layout, _pool_stride_x, _pool_size); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + uk->ukernel(src, dst, indices, _pool_info, window_src, window); +} + +const char *CpuPool2dKernel::name() const +{ + return "CpuPool2dKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPool2dKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h new file mode 100644 index 0000000000..95298004e9 --- /dev/null +++ b/src/core/cpu/kernels/CpuPool2dKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H +#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the pooling layer kernel */ +class CpuPool2dKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuPool2dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel); + /** Configure kernel for a given list of arguments + * + * @note F16 are supported for pool sizes 2 and 3 only + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: Same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + const char *name() const override; + +private: + PoolingLayerInfo _pool_info{}; + DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _num_elems_processed_per_iteration{ 0 }; + BorderSize _border_size{ 0 }; + Size2D _pool_size{}; + int _pool_stride_x{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_POOL2D_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp deleted file mode 100644 index ccf73883f0..0000000000 --- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // dst initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info))); - - const bool requantize = src->quantization_info() != dst->quantization_info(); - - switch(src->data_type()) - { - case DataType::QASYMM8: - if(requantize) - { - create_arm_pooling_requant(src, dst, info, cpu_info); - } - else - { - create_arm_pooling(src, dst, info, cpu_info); - } - break; - case DataType::QASYMM8_SIGNED: - if(requantize) - { - create_arm_pooling_requant(src, dst, info, cpu_info); - } - else - { - create_arm_pooling(src, dst, info, cpu_info); - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - create_arm_pooling(src, dst, info, cpu_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - create_arm_pooling(src, dst, info, cpu_info); - break; - default: - break; - } - - Window win = calculate_max_window(*dst, Steps()); - INEKernel::configure(win); -} - -Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - -#ifndef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); -#endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), - "Only AVG and MAX pooling are supported by assembly kernels"); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - if(src_qinfo != dst_qinfo) - { - const float multiplier = src_qinfo.scale / dst_qinfo.scale; - int32_t dst_multiplier{}; - int32_t dst_shift{}; - ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); - } - else - { - if(src->data_type() == DataType::QASYMM8) - { - const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); - } - } - } - else - { - if(src->data_type() == DataType::QASYMM8) - { - // If dst is not configured, the quantization info are the same - const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); - } - } - return Status{}; -} - -void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(info); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - - const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); - - const auto src_shape = src->info()->tensor_shape(); - const auto dst_shape = dst->info()->tensor_shape(); - const auto src_padding = src->info()->padding(); - const auto dst_padding = dst->info()->padding(); - - const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; - const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); - const size_t ld_src_batch = ld_src_row * src_shape[2]; - const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; - const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); - const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - - _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, - out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); -} - -size_t CpuPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const -{ - return _kernel_asm->get_working_size(num_threads); -} - -bool CpuPoolingAssemblyWrapperKernel::is_configured() const -{ - return _kernel_asm != nullptr; -} - -template -void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? 
arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; - - arm_conv::pooling::PoolingWindow window{}; - window.cols = static_cast(info.pool_size.x()); - window.rows = static_cast(info.pool_size.y()); - - arm_conv::pooling::PoolingStride stride{}; - std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; - - constexpr unsigned int idx_width = 1; - constexpr unsigned int idx_height = 2; - constexpr unsigned int idx_channels = 0; - constexpr unsigned int idx_batches = 3; - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); - - // Configure assembly pooling kernel - auto pooling_kernel_asm = arm_conv::pooling::pooling(args); - if(pooling_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - _kernel_asm = std::move(pooling_kernel_asm); -} - -template -void CpuPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? 
arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; - - arm_conv::pooling::PoolingWindow window{}; - window.cols = static_cast(info.pool_size.x()); - window.rows = static_cast(info.pool_size.y()); - - arm_conv::pooling::PoolingStride stride{}; - std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; - - constexpr unsigned int idx_width = 1; - constexpr unsigned int idx_height = 2; - constexpr unsigned int idx_channels = 0; - constexpr unsigned int idx_batches = 3; - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - const float multiplier = src_qinfo.scale / dst_qinfo.scale; - int32_t dst_multiplier{}; - int32_t dst_shift{}; - quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); - - const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, - dst_qinfo.offset, - dst_shift, // left shift - 0, // right shift - dst_multiplier); - - // Configure assembly pooling kernel with requantization - auto pooling_kernel_asm = arm_conv::pooling::pooling(args, requant_args); - if(pooling_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - _kernel_asm = std::move(pooling_kernel_asm); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h deleted file mode 100644 index 34ec452deb..0000000000 --- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H -#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/kernels/assembly/pooling.hpp" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -#include "pool_common.hpp" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** This class is a wrapper for the assembly kernels. - * - * Some kernels were written in assembly and highly optimised for specific - * CPUs like A53 or A55. The arm compute library creates an instance of - * CpuPoolingAssemblyWrapperKernel and other auxiliary data structures to - * execute a single assembly kernel in the context of an NEFunction. - * - */ -class CpuPoolingAssemblyWrapperKernel final : public ICpuKernel -{ -public: - /** Constructor - */ - CpuPoolingAssemblyWrapperKernel() = default; - CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &) = delete; - CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &&) = default; - CpuPoolingAssemblyWrapperKernel &operator=(CpuPoolingAssemblyWrapperKernel &) = delete; - - const char *name() const override - { - return "CpuPoolingAssemblyWrapperKernel"; - } - - /** Initialise the kernel's src and dst. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src. - * @param[in] info Pooling meta-data. - * @param[in] cpu_info CPU information needed to select the most appropriate kernel. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - /** Indicates whether or not this function can be used to process the given parameters. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] dst Destination tensor to store the result of pooling. Data types supported: same as @p src. - * @param[in] info Pooling meta-data - * - * @return a status. - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - - /** Get size of the workspace needed by the assembly kernel. - * - * @param[in] num_threads Maximum number of threads that are going to be spawned. - * - * @return size of workspace - */ - size_t get_working_size(unsigned int num_threads) const; - - /** Was the asm kernel successfully configured? - * - * @return True if the asm kernel is configured and ready to run - */ - bool is_configured() const; - -private: - /** Helper function to create the assembly kernel. - * - * @param[in] src Source tensor info. - * @param[in] dst Destination tensor info. - * @param[in] info Pooling layer meta-data. - */ - template - void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - /** Helper function to create the assembly kernel with requantization support - * - * @param[in] src Source tensor info. - * @param[in] dst Destination tensor info. - * @param[in] info Pooling layer meta-data. 
- */ - template - void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - std::unique_ptr _kernel_asm{ nullptr }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPoolingKernel.cpp b/src/core/cpu/kernels/CpuPoolingKernel.cpp deleted file mode 100644 index a55f60d7ad..0000000000 --- a/src/core/cpu/kernels/CpuPoolingKernel.cpp +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuPoolingKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/pooling/neon/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -#include "src/core/NEON/wrapper/wrapper.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -using namespace misc::shape_calculator; - -struct PoolingSelectorData -{ - DataType dt; - DataLayout dl; - int pool_stride_x; - Size2D pool_size; -}; - -using PoolingSelectorPtr = std::add_pointer::type; -using PoolingKernelPtr = std::add_pointer::type; -struct PoolingKernel -{ - const char *name; - const PoolingSelectorPtr is_selected; - PoolingKernelPtr ukernel; -}; - -static const PoolingKernel available_kernels[] = -{ - { - "poolingMxN_qasymm8_neon_nhwc", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) - }, - { - "poolingMxN_qasymm8_signed_neon_nhwc", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - 
"poolingMxN_fp16_neon_nhwc", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "poolingMxN_fp32_neon_nhwc", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) - }, -#if defined(ENABLE_NCHW_KERNELS) - { - "pooling2_qasymm8_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "pooling3_qasymm8_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "poolingMxN_qasymm8_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, - { - "pooling2_qasymm8_signed_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "pooling3_qasymm8_signed_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "poolingMxN_qasymm8_signed_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "pooling2_fp16_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) - }, - { - "pooling3_fp16_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) - }, - { - "poolingMxN_fp16_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "pooling2_fp32_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - 
REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) - }, - { - "pooling3_fp32_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) - }, - { - "pooling7_fp32_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) - }, - { - "poolingMxN_fp32_neon_nchw", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) - }, -#endif /* defined(ENABLE_NCHW_KERNELS) */ -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt, dl, pool_stride_x, pool_size })) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, - const ITensorInfo *indices, Size2D pool_size) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0); - - int pool_stride_x = 0; - int pool_stride_y = 0; - int output_width = 0; - int output_height = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size.x(), pool_size.y(), pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); - - TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - } - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (src->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); - } - } - - const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, - unsigned int &num_elems_processed_per_iteration, - BorderSize &border_size, - int pool_size_x, int pool_size_y) -{ - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); - if(indices) - { - // Indices auto inizialitation if not yet initialized - auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, - pool_info))) - .set_data_type(DataType::U32) /* we store the offset to the element */); - } - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; - unsigned int num_elems_read_per_iteration = 0; - unsigned int num_elems_horizontal_window = 0; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int src_width = src->dimension(idx_width); - const int src_height = src->dimension(idx_height); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - const bool is_square = pool_size_x == pool_size_y; - const unsigned int pooled_w = dst->dimension(idx_width); - const unsigned int pooled_h = dst->dimension(idx_height); - - //If it's not squared and optimized will be executed the MxN - num_elems_read_per_iteration = 1; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - - if(is_square) - { - switch(src->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - switch(pool_size_x) - { - case 2: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - case 3: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - default: - break; - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - switch(pool_size_x) - { - case 2: - case 3: - num_elems_read_per_iteration = 4; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - break; - } - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - switch(pool_size_x) - { - case 2: - num_elems_read_per_iteration = 2; - break; - case 3: - num_elems_read_per_iteration = 4; // We use vload4 for pooling3 - break; - case 7: - num_elems_read_per_iteration = 8; // We use vload8 for pooling7 - break; - default: - break; - } - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } - } - - bool window_changed = false; - Window win{}; - if(data_layout == DataLayout::NCHW) - { - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - TensorShape dst_shape{ src->tensor_shape() }; - dst_shape.set(0, pooled_w); - dst_shape.set(1, pooled_h); - TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); - win = 
calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, ceil_to_multiple(src_width + border_size.right, pool_size_x), src_height + border_size.bottom); - AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window); - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window); - window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); - } - else - { - window_changed = update_window_and_padding(win, src_access, dst_access); - } - dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - - border_size = src->padding(); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -BorderSize CpuPoolingKernel::border_size() const -{ - return _border_size; -} - -void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool is_global_pooling = pool_info.is_global_pooling; - - // Get data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Update pool size in case of global pooling - const Size2D pool_size( - is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); - - // Set instance variables - _pool_info = pool_info; - _data_layout = src->data_layout(); - _pool_size = pool_size; - _pool_stride_x = pad_stride_info.stride().first; - - if(_data_layout == DataLayout::NHWC) - { - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); - } - else - { - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, - _border_size, pool_size.x(), pool_size.y()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); - } -} - -Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - - unsigned int num_elems_processed_per_iteration = 0; - BorderSize border_size(0); - - const bool is_global_pooling = pool_info.is_global_pooling; - - // Get data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), - (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, - pool_size_x, pool_size_y) - .first); - - return Status{}; -} - -void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); - ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1); - - const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first; - const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second; - const unsigned int pool_size = _pool_info.pool_size.width; - - Window window_src(window); - if(_data_layout == DataLayout::NCHW) - { - // Set step for src in x and y direction for the src - unsigned int window_x_inc = 0; - switch(src->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - { - window_x_inc = pool_stride_x; - if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) - { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; - } - break; - } - - case DataType::F16: - case DataType::F32: - { - window_x_inc = pool_stride_x; - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - } - } - window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); - window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); - } - else - { - window_src.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); - window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); - } - - const auto *uk = get_implementation(src->info()->data_type(), _data_layout, _pool_stride_x, _pool_size); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - uk->ukernel(src, dst, indices, _pool_info, window_src, window); -} - -const char *CpuPoolingKernel::name() const -{ - return "CpuPoolingKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPoolingKernel.h b/src/core/cpu/kernels/CpuPoolingKernel.h deleted file mode 100644 index 87d8f67119..0000000000 --- a/src/core/cpu/kernels/CpuPoolingKernel.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_POOLING_KERNEL_H -#define ARM_COMPUTE_CPU_POOLING_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the pooling layer kernel */ -class CpuPoolingKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuPoolingKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPoolingKernel); - /** Configure kernel for a given list of arguments - * - * @note F16 are supported for pool sizes 2 and 3 only - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: Same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. - */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref CpuPoolingKernel - * - * @note F16 are supported for pool sizes 2 and 3 only - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] dst Destination tensor info. Data types supported: Same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - const char *name() const override; - -private: - PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 0 }; - BorderSize _border_size{ 0 }; - Size2D _pool_size{}; - int _pool_stride_x{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_POOLING_KERNEL_H */ diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp new file mode 100644 index 0000000000..c78ffb9848 --- /dev/null +++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // dst initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info))); + + const bool requantize = src->quantization_info() != dst->quantization_info(); + + switch(src->data_type()) + { + case DataType::QASYMM8: + if(requantize) + { + create_arm_pooling_requant(src, dst, info, cpu_info); + } + else + { + create_arm_pooling(src, dst, info, cpu_info); + } + break; + case DataType::QASYMM8_SIGNED: + if(requantize) + { + create_arm_pooling_requant(src, dst, info, cpu_info); + } + else + { + create_arm_pooling(src, dst, info, cpu_info); + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + create_arm_pooling(src, dst, info, cpu_info); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + create_arm_pooling(src, dst, info, cpu_info); + break; + default: + break; + } + + Window win = calculate_max_window(*dst, Steps()); + INEKernel::configure(win); +} + +Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + +#ifndef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); +#endif /* __aarch64__ */ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), + "Only AVG and MAX pooling are supported by assembly kernels"); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + if(src_qinfo != dst_qinfo) + { + const float multiplier = src_qinfo.scale / dst_qinfo.scale; + int32_t dst_multiplier{}; + int32_t dst_shift{}; + ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); + } + else + { + if(src->data_type() == DataType::QASYMM8) + { + const bool has_padding = info.pad_stride_info.has_padding(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + } + } + } + else + { + if(src->data_type() == DataType::QASYMM8) + { + // If dst is not configured, the quantization info are the same + const bool has_padding = 
info.pad_stride_info.has_padding(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + } + } + return Status{}; +} + +void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); + + const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + + const auto src_shape = src->info()->tensor_shape(); + const auto dst_shape = dst->info()->tensor_shape(); + const auto src_padding = src->info()->padding(); + const auto dst_padding = dst->info()->padding(); + + const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; + const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); + const size_t ld_src_batch = ld_src_row * src_shape[2]; + const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; + const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); + const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; + + _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, + out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + working_space, info.thread_id, info.num_threads); +} + +size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const +{ + return _kernel_asm->get_working_size(num_threads); +} + +bool CpuPool2dAssemblyWrapperKernel::is_configured() const +{ + return _kernel_asm != nullptr; +} + +template +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +{ + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? 
arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + + arm_conv::pooling::PoolingWindow window{}; + window.cols = static_cast(info.pool_size.x()); + window.rows = static_cast(info.pool_size.y()); + + arm_conv::pooling::PoolingStride stride{}; + std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); + + const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + + constexpr unsigned int idx_width = 1; + constexpr unsigned int idx_height = 2; + constexpr unsigned int idx_channels = 0; + constexpr unsigned int idx_batches = 3; + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + + // Configure assembly pooling kernel + auto pooling_kernel_asm = arm_conv::pooling::pooling(args); + if(pooling_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + _kernel_asm = std::move(pooling_kernel_asm); +} + +template +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +{ + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? 
arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + + arm_conv::pooling::PoolingWindow window{}; + window.cols = static_cast(info.pool_size.x()); + window.rows = static_cast(info.pool_size.y()); + + arm_conv::pooling::PoolingStride stride{}; + std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); + + const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + + constexpr unsigned int idx_width = 1; + constexpr unsigned int idx_height = 2; + constexpr unsigned int idx_channels = 0; + constexpr unsigned int idx_batches = 3; + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + const float multiplier = src_qinfo.scale / dst_qinfo.scale; + int32_t dst_multiplier{}; + int32_t dst_shift{}; + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); + + const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, + dst_qinfo.offset, + dst_shift, // left shift + 0, // right shift + dst_multiplier); + + // Configure assembly pooling kernel with requantization + auto pooling_kernel_asm = arm_conv::pooling::pooling(args, requant_args); + if(pooling_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + _kernel_asm = std::move(pooling_kernel_asm); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h new file mode 100644 index 0000000000..3afa4c16a4 --- /dev/null +++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
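The requantized path above reduces the source/destination quantization to a single fixed-point multiplier plus shift before building the Requantize32 arguments. A standalone sketch of that step follows; the scale values are made-up examples, everything else mirrors the call used in create_arm_pooling_requant().

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
    #include <cstdint>

    const float src_scale  = 0.5f;  // example value only
    const float dst_scale  = 0.25f; // example value only
    const float multiplier = src_scale / dst_scale;

    int32_t q_multiplier = 0;
    int32_t q_shift      = 0;
    // Decomposes 'multiplier' into a fixed-point multiplier and a shift, the same
    // pair that is then handed to arm_conv::pooling::Requantize32.
    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &q_multiplier, &q_shift);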
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H +#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/NEON/kernels/assembly/pooling.hpp" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +#include "pool_common.hpp" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** This class is a wrapper for the assembly kernels. + * + * Some kernels were written in assembly and highly optimised for specific + * CPUs like A53 or A55. The arm compute library creates an instance of + * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to + * execute a single assembly kernel in the context of an NEFunction. + * + */ +class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel +{ +public: + /** Constructor + */ + CpuPool2dAssemblyWrapperKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel); + + const char *name() const override + { + return "CpuPool2dAssemblyWrapperKernel"; + } + + /** Initialise the kernel's src and dst. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src. + * @param[in] info Pooling meta-data. + * @param[in] cpu_info CPU information needed to select the most appropriate kernel. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2dAssemblyWrapperKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + /** Get size of the workspace needed by the assembly kernel. + * + * @param[in] num_threads Maximum number of threads that are going to be spawned. + * + * @return size of workspace + */ + size_t get_working_size(unsigned int num_threads) const; + + /** Was the asm kernel successfully configured? + * + * @return True if the asm kernel is configured and ready to run + */ + bool is_configured() const; + +private: + /** Helper function to create the assembly kernel. + * + * @param[in] src Source tensor info. + * @param[in] dst Destination tensor info. + * @param[in] info Pooling layer meta-data. + */ + template + void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + /** Helper function to create the assembly kernel with requantization support + * + * @param[in] src Source tensor info. + * @param[in] dst Destination tensor info. + * @param[in] info Pooling layer meta-data. 
+ */ + template + void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + std::unique_ptr _kernel_asm{ nullptr }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp new file mode 100644 index 0000000000..2c9a4f301b --- /dev/null +++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same 
width and height"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) + && std::get<0>(conv_info.stride()) > 2, + "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); + + if(data_layout == DataLayout::NCHW) + { + if(is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + } + } + + if(biases != nullptr) + { + if(is_data_type_quantized_asymmetric(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), + "Biases size and number of src feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, + "Biases should be one dimensional"); + } + + // Checks performed when dst is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + const auto data_type = src->data_type(); + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + } + return Status{}; +} + +inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, + DataType data_type, DataLayout data_layout) +{ + return gpu_target_is_in(gpu_target, + GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, + GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, + GPUTarget::G52, GPUTarget::G52LIT) + && (kernel_size <= 5) + && (conv_stride_x == 1) && (conv_stride_y == 1) + && (data_type == DataType::F32) + && (data_layout == DataLayout::NCHW); +} + +inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, + unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, + 
unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) +{ + const DataType data_type = src->data_type(); + const DataLayout data_layout = src->data_layout(); + unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + + const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); + + if(run_optimized_bifrost) + { + // Configure kernel window + switch(kernel_size) + { + case 1: + { + num_elems_read_per_iteration_x = 4; + num_elems_read_per_iteration_y = 4; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 4; + break; + } + case 3: + { + num_elems_read_per_iteration_x = 6; + num_elems_read_per_iteration_y = 5; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 3; + break; + } + case 5: + { + num_elems_read_per_iteration_x = 8; + num_elems_read_per_iteration_y = 6; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 2; + break; + } + default: + { + ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); + } + } + } + else + { + num_elems_read_per_iteration_y = kernel_size; + num_elems_written_per_iteration_x = 8; + num_elems_written_per_iteration_y = 1; + switch(kernel_size) + { + case 1: + switch(conv_stride_x) + { + case 1: + num_elems_read_per_iteration_x = 8; + break; + case 2: + num_elems_read_per_iteration_x = 16; + break; + case 3: + switch(src->element_size()) + { + case 1: + num_elems_read_per_iteration_x = 28; + break; + case 2: + num_elems_read_per_iteration_x = 24; + break; + case 4: + num_elems_read_per_iteration_x = 22; + break; + default: + ARM_COMPUTE_ERROR("Invalid data size"); + } + break; + default: + ARM_COMPUTE_ERROR("Invalid convolution stride X"); + } + break; + case 3: + switch(conv_stride_x) + { + case 1: + num_elems_read_per_iteration_x = 10; + break; + case 2: + num_elems_read_per_iteration_x = 17; + break; + default: + ARM_COMPUTE_ERROR("Invalid convolution stride X"); + } + break; + case 5: + switch(conv_stride_x) + { + case 1: + num_elems_read_per_iteration_x = 12; + break; + case 2: + num_elems_read_per_iteration_x = 20; + break; + default: + ARM_COMPUTE_ERROR("Invalid convolution stride X"); + } + break; + case 9: + switch(conv_stride_x) + { + case 1: + num_elems_read_per_iteration_x = 16; + break; + case 2: + num_elems_read_per_iteration_x = 24; + break; + default: + ARM_COMPUTE_ERROR("Invalid convolution stride X"); + } + break; + default: + ARM_COMPUTE_ERROR("Invalid direct convolution size"); + } + } +} + +std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target) +{ + const DataLayout data_layout = src->data_layout(); + + // Get dst shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, output_shape, + 1, + src->data_type(), + src->quantization_info()); + + if(data_layout == DataLayout::NHWC) + { + const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); + unsigned int num_rows = 1U; + if(dst->tensor_shape()[0] > 16) + { + num_rows = src->data_type() == DataType::F32 ? 
2U : 4U; + } + + // Create window and update padding + Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + return std::make_pair(Status{}, win); + } + else if(data_layout == DataLayout::NCHW) + { + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int kernel_size = weights->dimension(width_idx); + + unsigned int num_elems_read_per_iteration_x = 0; + unsigned int num_elems_read_per_iteration_y = 0; + unsigned int num_elems_written_per_iteration_x = 0; + unsigned int num_elems_written_per_iteration_y = 0; + + unsigned int conv_pad_left = conv_info.pad_left(); + unsigned int conv_pad_top = conv_info.pad_top(); + unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + + setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, + num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, + kernel_size, conv_info, target, src); + + // Create window and update padding + bool window_changed = false; + Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); + + AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); + AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); + AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); + window_changed = update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); + } + else + { + ARM_COMPUTE_ERROR("Not supported"); + } +} + +bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) +{ + if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) + { + return false; + } + + // If not floating point + if(!is_data_type_float(tensor->data_type())) + { + return false; + } + + if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) + { + return false; + } + + // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform + if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + return false; + } + + // Check cl image pitch alignment + if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + { + return false; + } + + const size_t image_w = tensor->tensor_shape()[0] / 4; + const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; + const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); + const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); + + if(image_w > max_image_w || image_h > max_image_h) + { + return false; + } + + return true; +} + +} // namespace + +BorderSize ClDirectConv2dKernel::border_size() const +{ + return _border_size; +} + +void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + // Perform validation + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, + weights, + (biases != nullptr) ? biases : nullptr, + dst, + conv_info)); + + const int conv_stride_x = std::get<0>(conv_info.stride()); + const int conv_stride_y = std::get<1>(conv_info.stride()); + + _data_layout = src->data_layout(); + _conv_info = conv_info; + + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const unsigned int kernel_size = weights->dimension(width_idx); + const DataType data_type = src->data_type(); + + const GPUTarget gpu_target = get_target(); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + std::stringstream kernel_name; + CLBuildOptions build_options; + + if(_data_layout == DataLayout::NHWC) + { + _border_size = BorderSize(); + + kernel_name << "direct_convolution_nhwc"; + + const unsigned int n0 = win_config.second.x().step(); + const unsigned int m0 = win_config.second.y().step(); + const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src->dimension(channel_idx)); + const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; + const unsigned int pad_left = conv_info.pad_left(); + const unsigned int pad_top = conv_info.pad_top(); + const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); + + // Update the padding for the weights tensor if we can export to cl_image + if(export_to_cl_image) + { + gemm::update_padding_for_cl_image(weights); + } + + if(biases != nullptr) + { + build_options.add_option(std::string("-DHAS_BIAS")); + build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); + } + + build_options.add_option("-cl-fast-relaxed-math"); + build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); + build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); + build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); + build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); + build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); + build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); + build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); + build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); + build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); + build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); + build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); + build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); + build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); + build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + int zero_value_s32; + zero_value.get(zero_value_s32); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DIS_QUANTIZED"); + build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); 
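// The quantized branch above folds the src, weights and dst scales into one float
// multiplier and hands it to the OpenCL kernel as the fixed-point pair
// DST_MULTIPLIER / DST_SHIFT. A self-contained illustration of that decomposition
// (a generic frexp-based sketch, not the library's calculate_quantized_multiplier()):
#include <cmath>
#include <cstdint>

inline void decompose_requant_scale(float scale, int32_t &fixed_point_multiplier, int &shift)
{
    int   exponent = 0;
    float mantissa = std::frexp(scale, &exponent); // scale = mantissa * 2^exponent, mantissa in [0.5, 1)
    auto  rounded  = static_cast<int64_t>(std::lround(static_cast<double>(mantissa) * (1ll << 31)));
    if(rounded == (1ll << 31)) // mantissa rounded up to 1.0: renormalise
    {
        rounded /= 2;
        ++exponent;
    }
    fixed_point_multiplier = static_cast<int32_t>(rounded);
    shift                  = exponent;
    // Example: scale = 0.0025f -> mantissa ~0.64, exponent = -8, so
    // fixed_point_multiplier ~1374389535 and shift = -8 (an arithmetic right shift
    // of 8 after the 32x32 -> 64-bit multiply in the kernel).
}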
+ build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); + } + else + { + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); + build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); + } + } + else + { + _border_size = BorderSize(src->padding()); + + kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; + + build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); + + const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); + + if(run_optimized_for_bifrost) + { + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + + kernel_name << "_f32_bifrost"; + } + else + { + build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); + build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); + build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); + build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + + kernel_name.str("direct_convolution_quantized"); + } + } + } + + _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name.str(); + _config_id += "_"; + _config_id += lower_string(string_from_data_type(data_type)); + _config_id += "_"; + _config_id += support::cpp11::to_string(kernel_size); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().left); + _config_id += "_"; + _config_id += 
support::cpp11::to_string(border_size().top); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().right); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().bottom); + _config_id += "_"; + _config_id += support::cpp11::to_string(conv_stride_x); + _config_id += "_"; + _config_id += support::cpp11::to_string(conv_stride_y); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(width_idx)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(height_idx)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(_data_layout)); +} + +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const GPUTarget target) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first); + + return Status{}; +} + +void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + if(_data_layout == DataLayout::NHWC) + { + cl::Image2D weights_cl_image; + + const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); + const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); + + slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); + slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); + + if(export_to_cl_image) + { + const size_t image_w = weights->info()->dimension(0) / 4; + const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); + const TensorShape shape2d(image_w, image_h); + const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; + + // Export cl_buffer to cl_image + weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); + } + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, slice); + add_4D_tensor_argument(idx, dst, slice); + if(export_to_cl_image) + { + _kernel.setArg(idx++, weights_cl_image); + } + add_4D_tensor_argument(idx, weights, slice); + if(biases != nullptr) + { + add_1D_tensor_argument(idx, biases, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + else + { + Window win_in = window; + + win_in.adjust(Window::DimX, -_conv_info.pad_left(), true); + win_in.adjust(Window::DimY, -_conv_info.pad_top(), true); + + const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(_data_layout, 
DataLayoutDimension::HEIGHT); + + const int conv_stride_x = std::get<0>(_conv_info.stride()); + const int conv_stride_y = std::get<1>(_conv_info.stride()); + + win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x); + win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y); + + Window slice_in = win_in.first_slice_window_3D(); + unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); + add_3D_tensor_argument(idx1, weights, slice); + + if(biases != nullptr) + { + Window slice_biases; + slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); + add_1D_tensor_argument(idx1, biases, slice_biases); + } + + _kernel.setArg(idx1++, static_cast(weights->info()->strides_in_bytes()[3])); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice_in); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); + } +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h new file mode 100644 index 0000000000..ec76624e5c --- /dev/null +++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H +#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the direct convolution kernel. + */ +class ClDirectConv2dKernel : public IClKernel +{ +public: + ClDirectConv2dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel); + /** Set the src, weights, biases and dst tensors info. + * + * @note: Due to set_valid_region(), thus src/weights/biases cannot be const. Need to change this once the set_valid_region() is removed. 
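// A minimal configuration sketch for the kernel declared here (illustrative shapes,
// target and padding; not part of this patch). It sets up a 3x3, stride-1 F32 direct
// convolution in NCHW, with the weights laid out as documented below
// ([kernel_x, kernel_y, IFM, OFM]).
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"

inline void configure_direct_conv2d_sketch()
{
    using namespace arm_compute;

    TensorInfo src(TensorShape(32U, 32U, 16U), 1, DataType::F32);    // [W, H, IFM]
    TensorInfo wei(TensorShape(3U, 3U, 16U, 64U), 1, DataType::F32); // [kernel_x, kernel_y, IFM, OFM]
    TensorInfo dst{};                                                 // shape auto-initialised by configure()
    src.set_data_layout(DataLayout::NCHW);
    wei.set_data_layout(DataLayout::NCHW);

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1x1, pad 1x1 -> same spatial size

    opencl::kernels::ClDirectConv2dKernel conv_kernel;
    conv_kernel.set_target(GPUTarget::G76);
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClDirectConv2dKernel::validate(&src, &wei, nullptr, &dst, conv_info, GPUTarget::G76));
    conv_kernel.configure(CLKernelLibrary::get().get_compile_context(), &src, &wei, nullptr, &dst, conv_info);
}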
+ * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 + * 3x3 convolution with stride_x = 1/2, stride_y = 1/2 + * 5x5 convolution with stride_x = 1/2, stride_y = 1/2 + * 9x9 convolution with stride_x = 1/2, stride_y = 1/2 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the src's volume 3rd dimension. + * Data type supported:Same as @p src. + * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM]. + * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] dst Output tensor info. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClDirectConv2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +public: + DataLayout _data_layout{}; + BorderSize _border_size{}; + PadStrideInfo _conv_info{}; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /*ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp deleted file mode 100644 index 0a5101f564..0000000000 --- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp +++ /dev/null @@ -1,665 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), - "Weights feature map dimension should match the respective src's one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) - && std::get<0>(conv_info.stride()) > 2, - "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); - - if(data_layout == DataLayout::NCHW) - { - if(is_data_type_quantized(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); - } - } - - if(biases != nullptr) - { - if(is_data_type_quantized_asymmetric(src->data_type())) - { - 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), - "Biases size and number of src feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - } - return Status{}; -} - -inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, - DataType data_type, DataLayout data_layout) -{ - return gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && (kernel_size <= 5) - && (conv_stride_x == 1) && (conv_stride_y == 1) - && (data_type == DataType::F32) - && (data_layout == DataLayout::NCHW); -} - -inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, - unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, - unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) -{ - const DataType data_type = src->data_type(); - const DataLayout data_layout = src->data_layout(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); - - if(run_optimized_bifrost) - { - // Configure kernel window - switch(kernel_size) - { - case 1: - { - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 4; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 4; - break; - } - case 3: - { - num_elems_read_per_iteration_x = 6; - num_elems_read_per_iteration_y = 5; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 3; - break; - } - case 5: - { - num_elems_read_per_iteration_x = 8; - num_elems_read_per_iteration_y = 6; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 2; - break; - } - default: - { - ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); - } - } - } - else - { - num_elems_read_per_iteration_y = kernel_size; - num_elems_written_per_iteration_x = 8; - num_elems_written_per_iteration_y = 1; - switch(kernel_size) - { - case 1: - switch(conv_stride_x) - { - case 1: - 
num_elems_read_per_iteration_x = 8; - break; - case 2: - num_elems_read_per_iteration_x = 16; - break; - case 3: - switch(src->element_size()) - { - case 1: - num_elems_read_per_iteration_x = 28; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - case 4: - num_elems_read_per_iteration_x = 22; - break; - default: - ARM_COMPUTE_ERROR("Invalid data size"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 3: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 10; - break; - case 2: - num_elems_read_per_iteration_x = 17; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 5: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 12; - break; - case 2: - num_elems_read_per_iteration_x = 20; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 9: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 16; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid direct convolution size"); - } - } -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target) -{ - const DataLayout data_layout = src->data_layout(); - - // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); - - if(data_layout == DataLayout::NHWC) - { - const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); - unsigned int num_rows = 1U; - if(dst->tensor_shape()[0] > 16) - { - num_rows = src->data_type() == DataType::F32 ? 
2U : 4U; - } - - // Create window and update padding - Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); - return std::make_pair(Status{}, win); - } - else if(data_layout == DataLayout::NCHW) - { - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int kernel_size = weights->dimension(width_idx); - - unsigned int num_elems_read_per_iteration_x = 0; - unsigned int num_elems_read_per_iteration_y = 0; - unsigned int num_elems_written_per_iteration_x = 0; - unsigned int num_elems_written_per_iteration_y = 0; - - unsigned int conv_pad_left = conv_info.pad_left(); - unsigned int conv_pad_top = conv_info.pad_top(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, - num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, - kernel_size, conv_info, target, src); - - // Create window and update padding - bool window_changed = false; - Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); - AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} - -bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) -{ - if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) - { - return false; - } - - // If not floating point - if(!is_data_type_float(tensor->data_type())) - { - return false; - } - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) - { - return false; - } - - // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) - { - return false; - } - - const size_t image_w = tensor->tensor_shape()[0] / 4; - const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - - if(image_w > max_image_w || image_h > max_image_h) - { - return false; - } - - return true; -} - -} // namespace - -BorderSize ClDirectConvolutionKernel::border_size() const -{ - return _border_size; -} - -void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - weights, - (biases != nullptr) ? biases : nullptr, - dst, - conv_info)); - - const int conv_stride_x = std::get<0>(conv_info.stride()); - const int conv_stride_y = std::get<1>(conv_info.stride()); - - _data_layout = src->data_layout(); - _conv_info = conv_info; - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const unsigned int kernel_size = weights->dimension(width_idx); - const DataType data_type = src->data_type(); - - const GPUTarget gpu_target = get_target(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - std::stringstream kernel_name; - CLBuildOptions build_options; - - if(_data_layout == DataLayout::NHWC) - { - _border_size = BorderSize(); - - kernel_name << "direct_convolution_nhwc"; - - const unsigned int n0 = win_config.second.x().step(); - const unsigned int m0 = win_config.second.y().step(); - const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; - const unsigned int pad_left = conv_info.pad_left(); - const unsigned int pad_top = conv_info.pad_top(); - const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); - - // Update the padding for the weights tensor if we can export to cl_image - if(export_to_cl_image) - { - gemm::update_padding_for_cl_image(weights); - } - - if(biases != nullptr) - { - build_options.add_option(std::string("-DHAS_BIAS")); - build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); - } - - build_options.add_option("-cl-fast-relaxed-math"); - build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); - build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); - build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); - build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); - build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); - build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); - build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); - build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); - build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); - build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); - build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); - build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); - build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); - build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); - build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - int zero_value_s32; - zero_value.get(zero_value_s32); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DIS_QUANTIZED"); - build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); 
- build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - } - else - { - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); - build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - } - } - else - { - _border_size = BorderSize(src->padding()); - - kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; - - build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); - - const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); - - if(run_optimized_for_bifrost) - { - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - - kernel_name << "_f32_bifrost"; - } - else - { - build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); - build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); - build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - - kernel_name.str("direct_convolution_quantized"); - } - } - } - - _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += support::cpp11::to_string(kernel_size); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().left); - _config_id += "_"; - _config_id += 
support::cpp11::to_string(border_size().top); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().right); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().bottom); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_x); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_y); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(width_idx)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(height_idx)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); -} - -Status ClDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const GPUTarget target) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first); - - return Status{}; -} - -void ClDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Get initial windows - Window slice = window.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(_data_layout == DataLayout::NHWC) - { - cl::Image2D weights_cl_image; - - const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); - const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); - - slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); - slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); - - if(export_to_cl_image) - { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); - const TensorShape shape2d(image_w, image_h); - const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; - - // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); - } - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - if(export_to_cl_image) - { - _kernel.setArg(idx++, weights_cl_image); - } - add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) - { - add_1D_tensor_argument(idx, biases, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - else - { - Window win_in = window; - - win_in.adjust(Window::DimX, -_conv_info.pad_left(), true); - win_in.adjust(Window::DimY, -_conv_info.pad_top(), true); - - const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(_data_layout, 
DataLayoutDimension::HEIGHT); - - const int conv_stride_x = std::get<0>(_conv_info.stride()); - const int conv_stride_y = std::get<1>(_conv_info.stride()); - - win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x); - win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y); - - Window slice_in = win_in.first_slice_window_3D(); - unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); - add_3D_tensor_argument(idx1, weights, slice); - - if(biases != nullptr) - { - Window slice_biases; - slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); - add_1D_tensor_argument(idx1, biases, slice_biases); - } - - _kernel.setArg(idx1++, static_cast(weights->info()->strides_in_bytes()[3])); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h deleted file mode 100644 index 384b561003..0000000000 --- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H -#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the direct convolution kernel. - */ -class ClDirectConvolutionKernel : public IClKernel -{ -public: - ClDirectConvolutionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConvolutionKernel); - /** Set the src, weights, biases and dst tensors info. - * - * @note: Due to set_valid_region(), thus src/weights/biases cannot be const. Need to change this once the set_valid_region() is removed. 
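For reference, the quantized path of the direct convolution kernel above folds the source, weights and destination scales (iq.scale * wq.scale / oq.scale) into the OUTPUT_MULTIPLIER / OUTPUT_SHIFT pair passed as build options. The snippet below is a minimal, standard-library-only sketch of that decomposition; the library's quantization::calculate_quantized_multiplier() is the authoritative version and may differ in rounding and error handling.

    #include <cmath>
    #include <cstdint>

    // Illustrative only: split a real requantization scale into a Q0.31
    // fixed-point multiplier and a right shift, so the OpenCL kernel can
    // requantize the int32 accumulator with integer arithmetic.
    void decompose_multiplier(float real_multiplier, int32_t &quant_multiplier, int32_t &right_shift)
    {
        int          exponent = 0;
        const double mantissa = std::frexp(real_multiplier, &exponent); // mantissa in [0.5, 1)
        int64_t      q        = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
        if(q == (1ll << 31)) // mantissa rounded up to 1.0
        {
            q /= 2;
            ++exponent;
        }
        quant_multiplier = static_cast<int32_t>(q);
        right_shift      = -exponent; // shift applied after the fixed-point multiply
    }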
- * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * 3x3 convolution with stride_x = 1/2, stride_y = 1/2 - * 5x5 convolution with stride_x = 1/2, stride_y = 1/2 - * 9x9 convolution with stride_x = 1/2, stride_y = 1/2 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the src's volume 3rd dimension. - * Data type supported:Same as @p src. - * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info); - /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolutionKernel - * - * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the src's volume 3rd dimension. - * Data type supported:Same as @p src. - * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] target Target GPU architecture. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - DataLayout _data_layout{}; - BorderSize _border_size{}; - PadStrideInfo _conv_info{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp new file mode 100644 index 0000000000..0e15bffd14 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp @@ -0,0 +1,509 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClPool2dKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +// Internal window config info +using ClPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size + +void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info) +{ + TensorShape out_shape = compute_pool_shape(*src, pool_info); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); + if(indices) + { + auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); + } +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), + "Unsupported combination of parameters!"); + + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const bool is_global_pooling = pool_info.is_global_pooling; + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; + int output_width = 0; + int output_height = 0; + std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + pool_size_x, pool_size_y, pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + + // Check indices + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + + if(indices->total_size() != 0) + { + TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); + } + } + + // Checks performed when dst is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + } + + return Status{}; +} + +std::tuple validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Get data layout + const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_right = pad_stride_info.pad_right(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const int pool_pad_bottom = pad_stride_info.pad_bottom(); + BorderSize border_size = BorderSize(); + + auto_init(src, dst, indices, pool_info); + pooled_w = dst->tensor_shape()[idx_width]; + pooled_h = dst->tensor_shape()[idx_height]; + + const DataType data_type = src->data_type(); + + const int src_width = src->dimension(idx_width); + const int src_height = src->dimension(idx_height); + + unsigned int num_elems_processed_per_iteration = 0; + bool window_changed = false; + Window win{}; + switch(data_layout) + { + case DataLayout::NCHW: + { + // Initialize border size + border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); + // Change the number of elements processed per iteration + // for pooling 3x3 with stride less equal than 3 + const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); + num_elems_processed_per_iteration = can_optimize ? 
4 : 1; + const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; + + // Number of iterations in X dimension + const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; + + // Upper limit for the number of right/bottom border elements that are accessed + const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; + + border_size.right = std::max(upper_bound_w, pool_pad_right); + border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); + + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, + pool_stride_x, pool_stride_y); + AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration); + + // Update indices window + if(indices) + { + AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); + indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, src_access, dst_access); + } + + dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + break; + } + case DataLayout::NHWC: + { + const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4; + + // Initialize border size + border_size = BorderSize(); + num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0)); + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size)); +} +} // namespace + +ClPool2dKernel::ClPool2dKernel() + : _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1) +{ +} + +BorderSize ClPool2dKernel::border_size() const +{ + return _border_size; +} + +void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + auto padding_info = get_padding_info({ src, dst, indices }); + + // Set instance variables + _pool_info = pool_info; + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + int pool_stride_x = 0; + int pool_stride_y = 0; + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? 
src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool exclude_padding = pool_info.exclude_padding; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + + // Set build options + CLBuildOptions build_opts; + const DataType data_type = src->data_type(); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, pool_info, indices); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + ICLKernel::configure_internal(std::get<1>(win_config)); + + ClPoolingConfig pooling_config = std::get<2>(win_config); + _num_elems_processed_per_iteration = pooling_config.first; + _border_size = pooling_config.second; + + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); + + // Tensor paddings are used to calculate the indicies for MAX pooling + if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + { + build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left)); + build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right)); + build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top)); + build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom)); + build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel))); + build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); + build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); + } + + if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Check dst dimensions + auto_init(src, dst, indices, pool_info); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); + build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); + build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); + build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left)); + build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top)); + build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); + build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); + + // Set the initial value for the pooling operation accordingly with the data type + if(pool_type == PoolingType::MAX) + { + 
if(is_data_type_quantized(data_type)) + { + PixelValue type_min{}; + std::tie(type_min, std::ignore) = get_min_max(data_type); + build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get())); + } + else + { + build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); + } + } + else + { + // Pool AVG and Pool L2 initial value + build_opts.add_option("-DINITIAL_VALUE=0"); + } + + build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); + build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); + + // Create kernel + switch(_data_layout) + { + case DataLayout::NCHW: + { + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; + const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); + const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); + build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); + build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); + + if(pool_type != PoolingType::MAX) + { + build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); + } + + if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) + { + // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where + // each thread computes 4 dst elements + const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3); + + std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") + + support::cpp11::to_string(pool_size_x); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + { + // For max pooling with pool2x2, store indicies which will be used in max unpooling + if(data_type == DataType::F32) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(data_type == DataType::F16) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + } + else // Run general case + { + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + break; + } + case DataLayout::NHWC: + { + // Floating point mixed precision is support on F16 only + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + + // Wider accumulation is required to avoid accuracy loss + // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) + // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) + DataType acc_data_type = data_type; + + if(use_fp_mixed_precision) + { + acc_data_type = DataType::F32; + } + else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) + { + acc_data_type = DataType::S32; + } + + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); + build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); + build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); + build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); + build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) + { + build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); + + std::string kernel_name = "pooling_layer_2x2_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else + { + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + + // Set config_id for enabling LWS tuning + _config_id = "pooling_layer_"; + _config_id += lower_string(string_from_data_type(data_type)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(_data_layout)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_width)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_height)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_channel)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(src->data_layout())); + + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); +} + +Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info))); + + return Status{}; +} + +void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + unsigned int pool_stride_x = 0; + unsigned int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); + auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); + + // Collapse window + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + + switch(_data_layout) + { + case DataLayout::NCHW: + { + Window slice = window_collapsed.first_slice_window_3D(); + do + { + // Upsample src by pool size + Window in_slice(slice); + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(), + (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x, + pool_stride_x * _num_elems_processed_per_iteration)); + in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(), + (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y, + pool_stride_y)); + + // Set srcs + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, in_slice); + add_3D_tensor_argument(idx, dst, slice); + if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_3D_tensor_argument(idx, indices, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice)); + break; + } + case DataLayout::NHWC: + { + const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3); + + Window slice = window_collapsed.first_slice_window_4D(); + Window in_slice = window_collapsed.first_slice_window_4D(); + in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); + in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), 
pool_stride_x)); + in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); + in_slice.set(3, Window::Dimension(0, batch_size, 1)); + do + { + // Set srcs + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, in_slice); + add_4D_tensor_argument(idx, dst, slice); + if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_4D_tensor_argument(idx, indices, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.h b/src/core/gpu/cl/kernels/ClPool2dKernel.h new file mode 100644 index 0000000000..8ecb8eb7b7 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClPool2dKernel.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H +#define ARM_COMPUTE_CL_POOL2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the pooling layer kernel */ +class ClPool2dKernel : public IClKernel +{ +public: + /** Default constructor */ + ClPool2dKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel); + + /** Configure kernel for a given list of arguments + * + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
+ */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPool2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +public: + PoolingLayerInfo _pool_info; + DataLayout _data_layout; + BorderSize _border_size; + unsigned int _num_elems_processed_per_iteration; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp b/src/core/gpu/cl/kernels/ClPoolingKernel.cpp deleted file mode 100644 index 08a3ce3784..0000000000 --- a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
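Both ClPool2dKernel and the ClPoolingKernel it replaces reject configurations whose pooled output would collapse below 1x1, using scaled_dimensions_signed() in validate_arguments(). As a rough guide, the arithmetic reduces to the usual pooling shape formula; the sketch below assumes floor rounding and leaves out the CEIL rounding mode that the real helper also supports.

    #include <utility>

    // Illustrative pooled output size (floor rounding only). ClPool2dKernel's
    // validate_arguments() fails when either result is smaller than 1.
    std::pair<int, int> pooled_output_size(int in_w, int in_h,
                                           int pool_w, int pool_h,
                                           int pad_left, int pad_right,
                                           int pad_top, int pad_bottom,
                                           int stride_x, int stride_y)
    {
        const int out_w = (in_w + pad_left + pad_right - pool_w) / stride_x + 1;
        const int out_h = (in_h + pad_top + pad_bottom - pool_h) / stride_y + 1;
        return std::make_pair(out_w, out_h);
    }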
- */ -#include "src/core/gpu/cl/kernels/ClPoolingKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -// Internal window config info -using ClPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size - -void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info) -{ - TensorShape out_shape = compute_pool_shape(*src, pool_info); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); - if(indices) - { - auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); - } -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), - "Unsupported combination of parameters!"); - - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_global_pooling = pool_info.is_global_pooling; - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - int output_width = 0; - int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size_x, pool_size_y, pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); - - // Check indices - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - - if(indices->total_size() != 0) - { - TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); - } - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Get data layout - const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - int pool_stride_x = 0; - int pool_stride_y = 0; - unsigned int pooled_w = 0; - unsigned int pooled_h = 0; - int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - BorderSize border_size = BorderSize(); - - auto_init(src, dst, indices, pool_info); - pooled_w = dst->tensor_shape()[idx_width]; - pooled_h = dst->tensor_shape()[idx_height]; - - const DataType data_type = src->data_type(); - - const int src_width = src->dimension(idx_width); - const int src_height = src->dimension(idx_height); - - unsigned int num_elems_processed_per_iteration = 0; - bool window_changed = false; - Window win{}; - switch(data_layout) - { - case DataLayout::NCHW: - { - // Initialize border size - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - // Change the number of elements processed per iteration - // for pooling 3x3 with stride less equal than 3 - const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); - num_elems_processed_per_iteration = can_optimize ? 
4 : 1; - const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; - - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; - - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, - pool_stride_x, pool_stride_y); - AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration); - - // Update indices window - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); - indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); - } - else - { - window_changed = update_window_and_padding(win, src_access, dst_access); - } - - dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - break; - } - case DataLayout::NHWC: - { - const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4; - - // Initialize border size - border_size = BorderSize(); - num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0)); - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size)); -} -} // namespace - -ClPoolingKernel::ClPoolingKernel() - : _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1) -{ -} - -BorderSize ClPoolingKernel::border_size() const -{ - return _border_size; -} - -void ClPoolingKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst, indices }); - - // Set instance variables - _pool_info = pool_info; - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - int pool_stride_x = 0; - int pool_stride_y = 0; - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? 
src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool exclude_padding = pool_info.exclude_padding; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - - // Set build options - CLBuildOptions build_opts; - const DataType data_type = src->data_type(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, pool_info, indices); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - ICLKernel::configure_internal(std::get<1>(win_config)); - - ClPoolingConfig pooling_config = std::get<2>(win_config); - _num_elems_processed_per_iteration = pooling_config.first; - _border_size = pooling_config.second; - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); - - // Tensor paddings are used to calculate the indicies for MAX pooling - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left)); - build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right)); - build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top)); - build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom)); - build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel))); - build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - } - - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Check dst dimensions - auto_init(src, dst, indices, pool_info); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); - build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); - build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); - build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left)); - build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top)); - build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); - build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); - - // Set the initial value for the pooling operation accordingly with the data type - if(pool_type == PoolingType::MAX) - { - 
if(is_data_type_quantized(data_type)) - { - PixelValue type_min{}; - std::tie(type_min, std::ignore) = get_min_max(data_type); - build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get())); - } - else - { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); - } - } - else - { - // Pool AVG and Pool L2 initial value - build_opts.add_option("-DINITIAL_VALUE=0"); - } - - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); - - // Create kernel - switch(_data_layout) - { - case DataLayout::NCHW: - { - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; - const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); - build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); - - if(pool_type != PoolingType::MAX) - { - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - } - - if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) - { - // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where - // each thread computes 4 dst elements - const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3); - - std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") - + support::cpp11::to_string(pool_size_x); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - // For max pooling with pool2x2, store indicies which will be used in max unpooling - if(data_type == DataType::F32) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(data_type == DataType::F16) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - } - else // Run general case - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - case DataLayout::NHWC: - { - // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; - - // Wider accumulation is required to avoid accuracy loss - // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) - // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) - DataType acc_data_type = data_type; - - if(use_fp_mixed_precision) - { - acc_data_type = DataType::F32; - } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) - { - acc_data_type = DataType::S32; - } - - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); - build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); - build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); - build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) - { - build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); - - std::string kernel_name = "pooling_layer_2x2_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - // Set config_id for enabling LWS tuning - _config_id = "pooling_layer_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_width)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_height)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_channel)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(src->data_layout())); - - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); -} - -Status ClPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info))); - - return Status{}; -} - -void ClPoolingKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - unsigned int pool_stride_x = 0; - unsigned int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); - auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - Window slice = window_collapsed.first_slice_window_3D(); - do - { - // Upsample src by pool size - Window in_slice(slice); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(), - (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x, - pool_stride_x * _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(), - (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y, - pool_stride_y)); - - // Set srcs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_3D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_3D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); - break; - } - case DataLayout::NHWC: - { - const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3); - - Window slice = window_collapsed.first_slice_window_4D(); - Window in_slice = window_collapsed.first_slice_window_4D(); - in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), 
pool_stride_x)); - in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); - in_slice.set(3, Window::Dimension(0, batch_size, 1)); - do - { - // Set srcs - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, in_slice); - add_4D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_4D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.h b/src/core/gpu/cl/kernels/ClPoolingKernel.h deleted file mode 100644 index c1ce859e2c..0000000000 --- a/src/core/gpu/cl/kernels/ClPoolingKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_POOLING_KERNEL_H -#define ARM_COMPUTE_CL_POOLING_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the pooling layer kernel */ -class ClPoolingKernel : public IClKernel -{ -public: - /** Default constructor */ - ClPoolingKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPoolingKernel); - - /** Configure kernel for a given list of arguments - * - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref ClPoolingKernel - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
- * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - PoolingLayerInfo _pool_info; - DataLayout _data_layout; - BorderSize _border_size; - unsigned int _num_elems_processed_per_iteration; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_POOLING_KERNEL_H */ diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 74867ff64f..907e69d8d7 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -29,17 +29,17 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/runtime/gpu/cl/operators/ClActivation.h" -#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h" +#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h" namespace arm_compute { struct CLDirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClDirectConvolution> op{ nullptr }; + const ICLTensor *src{ nullptr }; + const ICLTensor *weights{ nullptr }; + const ICLTensor *biases{ nullptr }; + ICLTensor *dst{ nullptr }; + std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr }; }; CLDirectConvolutionLayer::CLDirectConvolutionLayer() @@ -65,14 +65,14 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context _impl->biases = biases; _impl->dst = output; - _impl->op = std::make_unique<opencl::ClDirectConvolution>(); + _impl->op = std::make_unique<opencl::ClDirectConv2d>(); _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ?
biases->info() : nullptr, output->info(), conv_info, act_info); } Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { - return opencl::ClDirectConvolution::validate(input, weights, biases, output, conv_info, act_info); + return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info); } void CLDirectConvolutionLayer::run() diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index fbaec1d2d9..7ba911c342 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -26,16 +26,16 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClPooling.h" +#include "src/runtime/gpu/cl/operators/ClPool2d.h" namespace arm_compute { struct CLPoolingLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr<opencl::ClPooling> op{ nullptr }; + const ICLTensor *src{ nullptr }; + ICLTensor *dst{ nullptr }; + ICLTensor *indices{ nullptr }; + std::unique_ptr<opencl::ClPool2d> op{ nullptr }; }; CLPoolingLayer::CLPoolingLayer() @@ -55,13 +55,13 @@ void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTenso _impl->dst = output; _impl->indices = indices; - _impl->op = std::make_unique<opencl::ClPooling>(); + _impl->op = std::make_unique<opencl::ClPool2d>(); _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); } Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) { - return opencl::ClPooling::validate(input, output, pool_info, indices); + return opencl::ClPool2d::validate(input, output, pool_info, indices); } void CLPoolingLayer::run() diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index da9610ef42..a561b88058 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -27,7 +27,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h" +#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h" using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; @@ -47,15 +47,15 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal const ITensor *biases { nullptr - }; // SRC_2 - Tensor permuted_input{}; // INT_0 - Tensor permuted_weights{}; // INT_1 - Tensor permuted_output{}; // INT_2 - Tensor workspace{}; // INT_3 - Tensor packed_weights{}; // INT_4 - std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr }; - bool is_prepared{ false }; - bool permute{ false }; + }; // SRC_2 + Tensor permuted_input{}; // INT_0 + Tensor permuted_weights{}; // INT_1 + Tensor permuted_output{}; // INT_2 + Tensor workspace{}; // INT_3 + Tensor packed_weights{}; // INT_4 + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr }; + bool is_prepared{ false }; + bool permute{ false }; }; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager>
memory_manager) @@ -80,7 +80,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->dst = output; _impl->permute = is_nhwc; - _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>(); + _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info); @@ -97,7 +97,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: } info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation }; - auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConvolutionAssemblyDispatch>(); + auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>(); if(is_nhwc) { @@ -154,7 +154,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal const Size2D &dilation) { ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info); + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run() @@ -197,17 +197,17 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl { - Tensor permuted_input{}; - Tensor permuted_weights{}; - Tensor permuted_output{}; - bool is_prepared{ false }; - bool is_nchw{ false }; - bool is_activationlayer_enabled{ false }; - const ITensor *weights{ nullptr }; - const ITensor *biases{ nullptr }; - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr }; + Tensor permuted_input{}; + Tensor permuted_weights{}; + Tensor permuted_output{}; + bool is_prepared{ false }; + bool is_nchw{ false }; + bool is_activationlayer_enabled{ false }; + const ITensor *weights{ nullptr }; + const ITensor *biases{ nullptr }; + const ITensor *src{ nullptr }; + ITensor *dst{ nullptr }; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr }; }; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() @@ -223,7 +223,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( output->info(), conv_info, depth_multiplier, act_info, dilation)); const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>(); + _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info); _impl->src = input; @@ -253,7 +253,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( output_to_use = &_impl->permuted_output; } - auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>(); + auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ?
nullptr : biases->info(), output_to_use->info(), info); if(_impl->is_nchw) @@ -273,7 +273,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info); + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run() @@ -298,10 +298,10 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr #ifndef DOXYGEN_SKIP_THIS struct NEDepthwiseConvolutionLayer::Impl { - DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED }; - NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr }; - NEDepthwiseConvolutionLayerGeneric func_generic{}; - std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr }; + DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED }; + NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr }; + NEDepthwiseConvolutionLayerGeneric func_generic{}; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr }; }; #endif // DOXYGEN_SKIP_THIS @@ -309,7 +309,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh const ActivationLayerInfo &act_info, const Size2D &dilation) { const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - _impl->op = std::make_shared<cpu::CpuDepthwiseConvolution>(); + _impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>(); _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info); switch(_impl->depth_conv_func) @@ -329,7 +329,7 @@ Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info); + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index 73834381c6..58530e4a8f 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -27,17 +27,17 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuDirectConvolution.h" +#include "src/runtime/cpu/operators/CpuDirectConv2d.h" namespace arm_compute { struct NEDirectConvolutionLayer::Impl { - ITensor *src{ nullptr }; - const ITensor *weights{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuDirectConvolution> op{ nullptr }; + ITensor *src{ nullptr }; + const ITensor *weights{ nullptr }; + const ITensor *bias{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuDirectConv2d> op{ nullptr }; }; NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) @@ -52,14 +52,14 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, _impl->weights = weights; _impl->bias = bias; _impl->dst = output; - _impl->op = std::make_unique<cpu::CpuDirectConvolution>(_memory_manager); + _impl->op = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager); _impl->op->configure(input->info(), weights->info(), (bias != nullptr ?
bias->info() : nullptr), output->info(), conv_info, act_info); } Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { - return cpu::CpuDirectConvolution::validate(input, weights, bias, output, conv_info, act_info); + return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info); } void NEDirectConvolutionLayer::run() diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 1570cdeedc..bbf3e7cc4e 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -26,17 +26,17 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" -#include "src/runtime/cpu/operators/CpuPooling.h" +#include "src/runtime/cpu/operators/CpuPool2d.h" namespace arm_compute { struct NEPoolingLayer::Impl { - ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - ITensor *indices{ nullptr }; - Tensor workspace{ nullptr }; - std::unique_ptr<cpu::CpuPooling> op{ nullptr }; + ITensor *src{ nullptr }; + ITensor *dst{ nullptr }; + ITensor *indices{ nullptr }; + Tensor workspace{ nullptr }; + std::unique_ptr<cpu::CpuPool2d> op{ nullptr }; }; NEPoolingLayer::~NEPoolingLayer() = default; @@ -51,7 +51,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay _impl->src = input; _impl->dst = output; _impl->indices = indices; - _impl->op = std::make_unique<cpu::CpuPooling>(); + _impl->op = std::make_unique<cpu::CpuPool2d>(); _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); // Allocate workspace based on kernel's memory requirements @@ -66,7 +66,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) { - return cpu::CpuPooling::validate(input, output, pool_info, indices); + return cpu::CpuPool2d::validate(input, output, pool_info, indices); } void NEPoolingLayer::run() diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp new file mode 100644 index 0000000000..160a9fd70b --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp @@ -0,0 +1,523 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/InfoHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + if(!is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); + const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if(biases != nullptr) + { + const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); + } + + ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); + + //Validate Activation Layer + if(info.act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); + } + return Status{}; +} +} // namespace + +CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::CpuDepthwiseConv2dOptimizedInternal() + : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false), + _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) +{ +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, + dst, info)); + + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _has_bias = biases != nullptr; + _is_nchw = src->data_layout() == DataLayout::NCHW; + _permute = _is_nchw; + _is_prepared = false; + + // Configure pipeline + ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); + const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); + const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); + _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6); + + if(!_is_activationlayer_enabled) + { + act_info_to_use = info.act_info; + } + + _dwc_optimized_func = std::make_unique(); + if(_is_nchw) + { + _permute_input = std::make_unique(); + _permute_weights = std::make_unique(); + _permute_output = std::make_unique(); + + auto input_perm = std::make_unique(); + auto weights_perm = std::make_unique(); + auto output_perm = std::make_unique(); + + // Configure the function to transform the input tensor from NCHW -> NHWC + _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); + input_perm->set_data_layout(DataLayout::NHWC); + + // Configure the function to transform the weights tensor from IHW -> HWI + _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); + weights_perm->set_data_layout(DataLayout::NHWC); + + output_perm->set_data_layout(DataLayout::NHWC); + output_perm->set_quantization_info(dst->quantization_info()); + + // Configure optimized depthwise + _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info); + + // Configure the function to transform the convoluted output to ACL's native ordering format NCHW + output_perm->set_data_layout(DataLayout::NHWC); + _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); + } + else + { + _dwc_optimized_func->configure(src, weights, biases, dst, info); + } + + // Configure activation + if(_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique(); + _activationlayer_function->configure(dst, nullptr, info.act_info); + } +} + +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + return validate_arguments_optimized(src, weights, biases, dst, info); +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + prepare(tensors); + + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute input + if(_permute) + { + ITensorPack pack; + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + } + + // Run assembly function + if(_is_nchw) + { + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src_perm); + pack.add_tensor(TensorType::ACL_SRC_1, 
weights_perm); + pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst_perm); + _dwc_optimized_func->run(pack); + } + else + { + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src); + pack.add_tensor(TensorType::ACL_SRC_1, weights); + pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst); + _dwc_optimized_func->run(pack); + } + + // Permute output + if(_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + // Run activation + if(_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute weights + if(_permute) + { + auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, permuted_weights); + _permute_weights->run(pack); + + weights->mark_as_unused(); + + ITensorPack pack_opt; + pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + else + { + ITensorPack pack_opt; + pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + + _is_prepared = true; + } +} + +CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::CpuDepthwiseConv2dGeneric() + : _depthwise_conv_kernel(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false), + _is_activationlayer_enabled(false) +{ +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, + dst, info)); + + _is_nchw = src->data_layout() == DataLayout::NCHW; + _is_prepared = !_is_nchw; + + ITensorInfo *input_to_use = src; + const ITensorInfo *weights_to_use = weights; + ITensorInfo *output_to_use = dst; + + auto input_perm = std::make_unique(); + auto weights_perm = std::make_unique(); + auto output_perm = std::make_unique(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + + if(_is_nchw) + { + _permute_input = std::make_unique(); + _permute_weights = std::make_unique(); + + _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); + input_perm->set_data_layout(DataLayout::NHWC); + input_to_use = input_perm.get(); + + _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); + weights_perm->set_data_layout(DataLayout::NHWC); + weights_to_use = weights_perm.get(); + + output_to_use = output_perm.get(); + } + + _depthwise_conv_kernel = std::make_unique(); + _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); + + if(_is_nchw) + { + _permute_output = std::make_unique(); + _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); + output_perm->set_data_layout(DataLayout::NHWC); + } + + //Configure Activation Layer + _is_activationlayer_enabled = info.act_info.enabled(); + if(_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique(); + _activationlayer_function->configure(dst, nullptr, info.act_info); + } +} + +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + if(src->data_layout() == DataLayout::NCHW) + { + TensorShape permuted_input_shape = src->tensor_shape(); + TensorShape permuted_weights_shape = weights->tensor_shape(); + TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); + permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); + permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); + + const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); + + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); + + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); + } + + // Validate Activation Layer + if(info.act_info.enabled()) + { + 
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); + } + + return Status{}; +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) +{ + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + + if(_is_nchw) + { + prepare(tensors); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + + ITensorPack pack_depth; + pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); + pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + } + else + { + ITensorPack pack_depth; + pack_depth.add_tensor(TensorType::ACL_SRC_0, src); + pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + } + + if(_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + if(_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + + ARM_COMPUTE_ERROR_ON(!weights->is_used()); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, weights_perm); + + _permute_weights->run(pack); + weights->mark_as_unused(); + _is_prepared = true; + } +} + +CpuDepthwiseConv2d::CpuDepthwiseConv2d() + : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic() +{ +} + +void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +{ + _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info); + switch(_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.configure(src, weights, biases, dst, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.configure(src, weights, biases, dst, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); + switch(depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) + { + return DepthwiseConvolutionFunction::OPTIMIZED; + } + else + { + return DepthwiseConvolutionFunction::GENERIC; + } +} + +void CpuDepthwiseConv2d::run(ITensorPack &tensors) +{ + switch(_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.run(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.run(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} + +void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) +{ + switch(_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.prepare(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.prepare(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h new file mode 100644 index 0000000000..049397fe60 --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2D_H +#define ARM_COMPUTE_CPU_DEPTHWISECONV2D_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/experimental/Types.h" +#include "src/core/cpu/ICpuKernel.h" +#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" +#include "src/runtime/cpu/ICpuOperator.h" +#include "src/runtime/cpu/operators/CpuActivation.h" +#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" +#include "src/runtime/cpu/operators/CpuPermute.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +/** Function to execute a depthwise convolution. + */ +class CpuDepthwiseConv2d : public ICpuOperator +{ +public: + /** Default constructor */ + CpuDepthwiseConv2d(); + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + + /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[in] dst Destination tensor. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + * + * @return a Depthwise Convolution Function + */ + static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + const ConvolutionInfo &info); + + // Inherited methods overriden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + +private: + /** Basic function to execute optimized depthwise convolution routines. 
This function calls the following kernels: + * + * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported + * + * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present + * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present + * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present + * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required + * -# @ref NEActivationLayer if fused activation is required + * + */ + class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator + { + public: + /** Default constructor */ + CpuDepthwiseConv2dOptimizedInternal(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete; + /** Default move constructor */ + CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete; + /** Default move assignment operator */ + CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default; + /** Default destructor */ + ~CpuDepthwiseConv2dOptimizedInternal() = default; + /** Initialize the function's source, destination, kernels and border_size. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dOptimizedInternal::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + + // Inherited methods overriden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + + private: + std::unique_ptr _dwc_optimized_func{ nullptr }; + std::unique_ptr _permute_input{ nullptr }; + std::unique_ptr _permute_weights{ nullptr }; + std::unique_ptr _permute_output{ nullptr }; + std::unique_ptr _activationlayer_function{ nullptr }; + bool _has_bias{ false }; + bool _is_quantized{ false }; + bool _is_nchw{ true }; + bool _permute{ false }; + bool _is_activationlayer_enabled{ false }; + bool _is_prepared{ false }; + }; + + /** Basic function to execute a generic depthwise convolution. 
This function calls the following kernel: + * + * -# @ref CpuDepthwiseConv2dNativeKernel + * + */ + class CpuDepthwiseConv2dGeneric : public ICpuOperator + { + public: + /** Default constructor */ + CpuDepthwiseConv2dGeneric(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete; + /** Default move constructor */ + CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete; + /** Default move assignment operator */ + CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default; + /** Default destructor */ + ~CpuDepthwiseConv2dGeneric() = default; + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dGeneric::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + + private: + std::unique_ptr _depthwise_conv_kernel{ nullptr }; + std::unique_ptr _permute_input{ nullptr }; + std::unique_ptr _permute_weights{ nullptr }; + std::unique_ptr _permute_output{ nullptr }; + std::unique_ptr _activationlayer_function{ nullptr }; + bool _is_nchw{ true }; + bool _is_prepared{ false }; + bool _is_activationlayer_enabled{ false }; + }; + + DepthwiseConvolutionFunction _depth_conv_func; + CpuDepthwiseConv2dOptimizedInternal _func_optimized; + CpuDepthwiseConv2dGeneric _func_generic; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp new file mode 100644 index 0000000000..a36ee1d45b --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -0,0 +1,563 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/InfoHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" +#include "src/core/helpers/AutoConfiguration.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qasymm8::QAsymm8RescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int 
in_cols, int n_channels, + neon_convolution_kernels::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return 
std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +std::unique_ptr create_convolver(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *output, + const ConvolutionInfo &info) +{ + const DataType data_type = src->data_type(); + const TensorShape shape = src->tensor_shape(); + + const int n_batches = shape[3]; + const int in_rows = shape.z(); + const int in_cols = shape.y(); + const int n_channels = shape.x(); + const int dilation_factor = info.dilation.x(); + const int padding_top = info.pad_stride_info.pad_top(); + const int padding_left = info.pad_stride_info.pad_left(); + const int padding_bottom = info.pad_stride_info.pad_bottom(); + const int padding_right = info.pad_stride_info.pad_right(); + + const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); + const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + + const unsigned int stride_x = info.pad_stride_info.stride().first; + const unsigned int kernel_size = weights->tensor_shape().y(); + + // Map activation function + neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; + if(arm_compute::utils::info_helpers::is_relu(info.act_info)) + { + activation = neon_convolution_kernels::ActivationFunction::ReLU; + } + else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) + { + activation = neon_convolution_kernels::ActivationFunction::ReLU6; + } + + // Create quantized convolver + if(is_uniform_quantized) + { + const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int32_t qmultiplier = 0; + int32_t qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); + + return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } + else if(is_perchannel_quantized) + { + const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->quantization_info(); + const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + 
ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + std::vector fmultipliers; + std::vector qmultipliers; + std::vector qshifts; + + for(auto const s : wqinfo.scales) + { + const float fmultipler = iqinfo.scale * s / oqinfo.scale; + int32_t qmultiplier = 0; + int32_t qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + fmultipliers.push_back(fmultipler); + qmultipliers.push_back(qmultiplier); + qshifts.push_back(qshift); + } + + qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); + + return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } + else + { + // Create float convolver + switch(data_type) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + { + return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + } + default: + return nullptr; + } + } +} +} // namespace + +struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl +{ + std::unique_ptr dwc_assembly_kernel{ nullptr }; + NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{}; + bool is_prepared{ false }; + experimental::MemoryRequirements mem_req{}; +}; + +#ifndef DOXYGEN_SKIP_THIS +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() + : _pImpl(std::make_unique()) +{ +} +#endif /* DOXYGEN_SKIP_THIS */ + +CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default; + +void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_UNUSED(bias); + ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src, + weights, + bias != nullptr ? 
bias : nullptr, + dst, + info)); + + // Output auto initialization if not yet initialized + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info())); + + _pImpl->is_prepared = false; + + // Create convolver + _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info); + ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr); + + // Create assembly kernel wrapper + _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get()); + + constexpr size_t alignment = 128; + + // Create workspace + const unsigned int num_threads = NEScheduler::get().num_threads(); + const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads); + ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); + _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); + + // Create packing tensor + const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size(); + ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); + + _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment }); +} + +experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const +{ + return _pImpl->mem_req; +} + +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + + // Validate convolver + ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info)); + + // Validate activation + const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); + const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); + ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); + + // Check bias + if(bias != nullptr) + { + unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); + } + + // Check output + if(dst->total_size() != 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + // The uniform quantization case will only have 1 scale value in the weights quantization info + const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->quantization_info(); + const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform(); + for(auto const s : weights_qinfo.scale()) + { + const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale; + ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); + } + + return
Status{}; +} + +bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo *src, + const ITensorInfo *weights, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); + + // Reshape input shape if in NHWC format + const DataLayout data_layout = src->data_layout(); + TensorShape in_shape{ src->tensor_shape() }; + if(data_layout == DataLayout::NHWC) + { + in_shape.set(Window::DimX, src->tensor_shape().y()); + in_shape.set(Window::DimY, src->tensor_shape().z()); + in_shape.set(Window::DimZ, src->tensor_shape().x()); + } + + // Check data type + const DataType input_type = src->data_type(); + const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; + const DataType weights_type = weights->data_type(); + const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED + || weights_type == DataType::QSYMM8_PER_CHANNEL; + + // Check weights size + std::set<unsigned int> supported_kernel_sizes = { 3, 5 }; + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int kernel_w = weights->dimension(width_idx); + const unsigned int kernel_h = weights->dimension(height_idx); + bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); + + // Check for supported strides + const auto &strides = info.pad_stride_info.stride(); + bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); + + // Check for supported padding + const auto pad_top = info.pad_stride_info.pad_top(); + const auto pad_right = info.pad_stride_info.pad_right(); + const auto pad_bottom = info.pad_stride_info.pad_bottom(); + const auto pad_left = info.pad_stride_info.pad_left(); + PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); + bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); + bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); + bool supported_padding = is_same_padding || is_valid_padding; + // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported + bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); + + if(weights_type == DataType::QSYMM8_PER_CHANNEL) + { + is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); + } + + return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; +} + +void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) +{ + // Prepare assembly kernel + prepare(tensors); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Setup inputs/outputs + ARM_COMPUTE_ERROR_ON(workspace == nullptr || workspace->buffer() == nullptr); + _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer())); + + ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr);
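// The strides handed to the assembly kernel below are expressed in elements: the code divides
// ITensorInfo::strides_in_bytes() by the element size rather than recomputing strides from the
// shape, so any alignment padding the tensor carries is preserved. As a rough illustration only
// (hypothetical, densely packed NHWC FP32 tensor of shape [C=16, W=32, H=32, N=1]):
//   strides_in_bytes = { 4, 64, 2048, 65536 }
//   element strides  = { col = 16, row = 512, batch = 16384 }  // what set_input()/set_output() receive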
+ const int input_element_size = src->info()->element_size(); + const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; + const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size; + const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; + const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); + + ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); + const int output_element_size = dst->info()->element_size(); + const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; + const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; + const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; + void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); + + // Schedule assembly kernel + NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); +} + +void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) +{ + if(!_pImpl->is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); + + ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); + + // Pack weights and bias + const int weights_element_size = weights->info()->element_size(); + const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; + const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; + _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), + weights->buffer() + weights->info()->offset_first_element_in_bytes(), + weights_row_stride, + weights_col_stride, + (bias != nullptr) ? bias->buffer() : nullptr); + _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); + + weights->mark_as_unused(); + if(bias != nullptr) + { + bias->mark_as_unused(); + } + _pImpl->is_prepared = true; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h new file mode 100644 index 0000000000..195942b7fd --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H +#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H + +#include "src/core/common/Macros.h" +#include "src/runtime/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Depthwise convolution assembly kernel glue */ +class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator +{ +public: + /** Default constructor */ + CpuDepthwiseConv2dAssemblyDispatch(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); + /** Default destructor */ + ~CpuDepthwiseConv2dAssemblyDispatch(); + + /** Initialize the function's source, destination, kernels and border_size. + * + * @note Supports only NHWC format + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src. + * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + /** Check if the optimized kernel can be used for the given kernel sizes and strides + * + * @warning Even if this returns true, the inputs and outputs might need to be permuted, as the only supported layout is NHWC + * + * @param[in] src Input tensor info. + * @param[in] weights Weights tensor info. + * @param[in] info Depthwise convolution meta-data. + * + * @return True if the assembly kernel could be used, else false. Note that transformations of input/output could be needed. + */ + static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + struct LocalImpl; + std::unique_ptr<LocalImpl> _pImpl; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp deleted file mode 100644 index 6d097280e0..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > input->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > input->dimension(idx_h) + info.pad_stride_info.pad_top() + - info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) - { - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, info)); - - //Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); - } - return Status{}; -} -} // namespace - 
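The size checks in validate_arguments_optimized() above compare the dilated kernel extent against the padded input extent. A minimal, self-contained sketch of that arithmetic (hypothetical helper, not part of the library API):

#include <cassert>

// Effective extent of a kernel of size k under dilation d: k + (k - 1) * (d - 1).
static inline int effective_kernel_extent(int k, int d)
{
    return k + (k - 1) * (d - 1);
}

int main()
{
    // A 3x3 kernel with dilation 2 covers 5 input elements per axis, so the validation above
    // requires 5 <= input_dim + pad_before + pad_after along that axis.
    assert(effective_kernel_extent(3, 2) == 5);
    return 0;
}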
-CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::CpuDepthwiseConvolutionOptimizedInternal() - : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false), - _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) -{ -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configure(ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, (biases == nullptr) ? nullptr : biases, - output, info)); - - _is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - _has_bias = biases != nullptr; - _is_nchw = input->data_layout() == DataLayout::NCHW; - _permute = _is_nchw; - _is_prepared = false; - - // Configure pipeline - ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6); - - if(!_is_activationlayer_enabled) - { - act_info_to_use = info.act_info; - } - - _dwc_optimized_func = std::make_unique(); - if(_is_nchw) - { - _permute_input = std::make_unique(); - _permute_weights = std::make_unique(); - _permute_output = std::make_unique(); - - auto input_perm = std::make_unique(); - auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(); - - // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - - // Configure the function to transform the weights tensor from IHW -> HWI - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - - output_perm->set_data_layout(DataLayout::NHWC); - output_perm->set_quantization_info(output->quantization_info()); - - // Configure optimized depthwise - _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info); - - // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - output_perm->set_data_layout(DataLayout::NHWC); - _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U)); - } - else - { - _dwc_optimized_func->configure(input, weights, biases, output, info); - } - - // Configure activation - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(output, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const ConvolutionInfo &info) -{ - return validate_arguments_optimized(input, weights, biases, output, info); -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - prepare(tensors); - - auto bias = 
tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute input - if(_permute) - { - ITensorPack pack; - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - } - - // Run assembly function - if(_is_nchw) - { - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src_perm); - pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst_perm); - _dwc_optimized_func->run(pack); - } - else - { - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src); - pack.add_tensor(TensorType::ACL_SRC_1, weights); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst); - _dwc_optimized_func->run(pack); - } - - // Permute output - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - // Run activation - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute weights - if(_permute) - { - auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, permuted_weights); - _permute_weights->run(pack); - - weights->mark_as_unused(); - - ITensorPack pack_opt; - pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - else - { - ITensorPack pack_opt; - pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - - _is_prepared = true; - } -} - -CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::CpuDepthwiseConvolutionGeneric() - : _depthwise_conv_kernel(nullptr), 
_permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false), - _is_activationlayer_enabled(false) -{ -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolution::validate(input, weights, (biases == nullptr) ? nullptr : biases, - output, info)); - - _is_nchw = input->data_layout() == DataLayout::NCHW; - _is_prepared = !_is_nchw; - - ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - ITensorInfo *output_to_use = output; - - auto input_perm = std::make_unique(); - auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - - if(_is_nchw) - { - _permute_input = std::make_unique(); - _permute_weights = std::make_unique(); - - _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - input_to_use = input_perm.get(); - - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - weights_to_use = weights_perm.get(); - - output_to_use = output_perm.get(); - } - - _depthwise_conv_kernel = std::make_unique(); - _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - - if(_is_nchw) - { - _permute_output = std::make_unique(); - _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U)); - output_perm->set_data_layout(DataLayout::NHWC); - } - - //Configure Activation Layer - _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(output, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - if(input->data_layout() == DataLayout::NCHW) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - - const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); - - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); - 
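// Note on the permutation vectors used throughout this function: PermutationVector(2U, 0U, 1U)
// is the NCHW -> NHWC rearrangement applied to the source and weights, and
// PermutationVector(1U, 2U, 0U) is its inverse, used to bring the NHWC result back to NCHW for
// the output. The checks that follow validate the same permutes that configure() sets up.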
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); - - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(input, weights, biases, output, info)); - } - - // Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); - } - - return Status{}; -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::run(ITensorPack &tensors) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - - if(_is_nchw) - { - prepare(tensors); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - - ITensorPack pack_depth; - pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); - pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - else - { - ITensorPack pack_depth; - pack_depth.add_tensor(TensorType::ACL_SRC_0, src); - pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(!weights->is_used()); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, weights_perm); - - _permute_weights->run(pack); - weights->mark_as_unused(); - _is_prepared = true; - } -} - -CpuDepthwiseConvolution::CpuDepthwiseConvolution() - : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic() -{ -} - -void CpuDepthwiseConvolution::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) -{ - _depth_conv_func = get_depthwiseconvolution_function(input, 
weights, (biases != nullptr) ? biases : nullptr, output, info); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.configure(input, weights, biases, output, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.configure(input, weights, biases, output, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status CpuDepthwiseConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, info); - switch(depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - return CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - return CpuDepthwiseConvolutionGeneric::validate(input, weights, biases, output, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -DepthwiseConvolutionFunction CpuDepthwiseConvolution::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info) -{ - if(bool(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } -} - -void CpuDepthwiseConvolution::run(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.run(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} - -void CpuDepthwiseConvolution::prepare(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h deleted file mode 100644 index e39cb7db4d..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZATION_H -#define ARM_COMPUTE_CPU_DEQUANTIZATION_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h" -#include "src/runtime/cpu/operators/CpuPermute.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to execute a depthwise convolution. - */ -class CpuDepthwiseConvolution : public ICpuOperator -{ -public: - /** Default constructor */ - CpuDepthwiseConvolution(); - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors info with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConvolution - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. 
- * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return a Depthwise Convolution Function - */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - -private: - /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels: - * - * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported - * - * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present - * -# @ref CpuDepthwiseConvolution3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present - * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required - * -# @ref NEActivationLayer if fused activation is required - * - */ - class CpuDepthwiseConvolutionOptimizedInternal : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConvolutionOptimizedInternal(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionOptimizedInternal(const CpuDepthwiseConvolutionOptimizedInternal &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionOptimizedInternal(CpuDepthwiseConvolutionOptimizedInternal &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionOptimizedInternal &operator=(const CpuDepthwiseConvolutionOptimizedInternal &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionOptimizedInternal &operator=(CpuDepthwiseConvolutionOptimizedInternal &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionOptimizedInternal() = default; - /** Initialize the function's source, destination, kernels and border_size. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution3x3 - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. 
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _dwc_optimized_func{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - }; - - /** Basic function to execute a generic depthwise convolution. This function calls the following kernel: - * - * -# @ref CpuDepthwiseConvolutionNativeKernel - * - */ - class CpuDepthwiseConvolutionGeneric : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConvolutionGeneric(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionGeneric(const CpuDepthwiseConvolutionGeneric &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionGeneric(CpuDepthwiseConvolutionGeneric &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionGeneric &operator=(const CpuDepthwiseConvolutionGeneric &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionGeneric &operator=(CpuDepthwiseConvolutionGeneric &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionGeneric() = default; - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionGeneric - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. 
A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _depthwise_conv_kernel{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; - }; - - DepthwiseConvolutionFunction _depth_conv_func; - CpuDepthwiseConvolutionOptimizedInternal _func_optimized; - CpuDepthwiseConvolutionGeneric _func_generic; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZATION_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp deleted file mode 100644 index 039714abb1..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp +++ /dev/null @@ -1,563 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" -#include "src/core/helpers/AutoConfiguration.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, 
in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr create_convolver(const ITensorInfo *input, - const ITensorInfo *weights, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - const DataType data_type = input->data_type(); - const TensorShape shape = input->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = info.dilation.x(); - const int padding_top = info.pad_stride_info.pad_top(); - const int padding_left = info.pad_stride_info.pad_left(); - const int padding_bottom = info.pad_stride_info.pad_bottom(); - const int padding_right = info.pad_stride_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == 
DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = info.pad_stride_info.stride().first; - const unsigned int kernel_size = weights->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector fmultipliers; - std::vector qmultipliers; - std::vector qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } - - qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); - - return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, - wqinfo, iqinfo, oqinfo, rescale_params, 
padding_top, padding_left, padding_bottom, padding_right); - } - else - { - // Create float convolver - switch(data_type) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } - default: - return nullptr; - } - } -} -} // namespace - -struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl -{ - std::unique_ptr dwc_assembly_kernel{ nullptr }; - NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{}; - bool is_prepared{ false }; - experimental::MemoryRequirements mem_req{}; -}; - -#ifndef DOXYGEN_SKIP_THIS -CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch() - : _pImpl(std::make_unique()) -{ -} -#endif /* DOXYGEN_SKIP_THIS */ - -CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch() = default; - -void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(bias); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, - weights, - bias != nullptr ? bias : nullptr, - output, - info)); - - // Output auto inizialitation if not yet initialized - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info())); - - _pImpl->is_prepared = false; - - // Create convolver - _pImpl->dwc_assembly_kernel = create_convolver(input, weights, output, info); - ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr); - - // Create assembly kernel wrapper - _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get()); - - constexpr size_t alignment = 128; - - // Create workspace - const unsigned int num_threads = NEScheduler::get().num_threads(); - const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads); - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); - - // Create packing tensor - const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size(); - ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); - - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment }); -} - -experimental::MemoryRequirements CpuDepthwiseConvolutionAssemblyDispatch::workspace() const -{ - return _pImpl->mem_req; -} - -Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != 
DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, info)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - // The uniform quantization case will only have 1 scale value in the weights quantization info - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = input_qinfo.scale * s / output_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; -} - -bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input, - const ITensorInfo *weights, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = input->data_layout(); - TensorShape in_shape{ input->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, input->tensor_shape().y()); - in_shape.set(Window::DimY, input->tensor_shape().z()); - in_shape.set(Window::DimZ, input->tensor_shape().x()); - } - - // Check data type - const DataType input_type = input->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weighs size - std::set supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = info.pad_stride_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = info.pad_stride_info.pad_top(); - 
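For every weights scale, validate() above recomputes the effective requantization multiplier input_scale * weights_scale / output_scale and rejects anything above 1.0; the configure path earlier in this hunk converts that same multiplier to a fixed-point multiplier and right shift via quantization::calculate_quantized_multiplier_less_than_one. The following is a minimal standalone sketch of such a decomposition in the usual Q0.31 form; it illustrates the idea and is not the library's implementation.

#include <cmath>
#include <cstdint>

// Decompose 0 < multiplier < 1 so that multiplier ~= quant_multiplier * 2^-(31 + right_shift),
// with quant_multiplier an int32_t in [2^30, 2^31) and right_shift >= 0.
void decompose_requant_multiplier(float multiplier, int32_t &quant_multiplier, int32_t &right_shift)
{
    int         exponent = 0;
    const float mantissa = std::frexp(multiplier, &exponent); // mantissa in [0.5, 1), multiplier = mantissa * 2^exponent
    right_shift          = -exponent;                         // >= 0 because multiplier < 1
    int64_t q            = std::llround(static_cast<double>(mantissa) * (1LL << 31));
    if(q == (1LL << 31)) // the mantissa rounded up to 1.0
    {
        q /= 2;
        --right_shift;
    }
    quant_multiplier = static_cast<int32_t>(q);
}

At run time an int32 accumulator acc is then rescaled roughly as round(acc * quant_multiplier / 2^31) >> right_shift, which is why multipliers of 1.0 or more cannot be represented and are rejected by the check above.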
const auto pad_right = info.pad_stride_info.pad_right(); - const auto pad_bottom = info.pad_stride_info.pad_bottom(); - const auto pad_left = info.pad_stride_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; -} - -void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors) -{ - // Prepare assembly kernel - prepare(tensors); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr); - _pImpl->dwc_assembly_kernel->set_working_space(static_cast(workspace->buffer())); - - ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr); - const int input_element_size = src->info()->element_size(); - const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size; - const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); - const int output_element_size = dst->info()->element_size(); - const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); - - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); -} - -void CpuDepthwiseConvolutionAssemblyDispatch::prepare(ITensorPack &tensors) -{ - if(!_pImpl->is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); - - // Pack weights and bias - const int weights_element_size = weights->info()->element_size(); - const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; - const int 
weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), - weights->buffer() + weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (bias != nullptr) ? bias->buffer() : nullptr); - _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); - - weights->mark_as_unused(); - if(bias != nullptr) - { - bias->mark_as_unused(); - } - _pImpl->is_prepared = true; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h deleted file mode 100644 index 6aac74c3ef..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H -#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Depthwise convolution assembly kernel glue */ -class CpuDepthwiseConvolutionAssemblyDispatch : public ICpuOperator -{ -public: - CpuDepthwiseConvolutionAssemblyDispatch(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionAssemblyDispatch(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionAssemblyDispatch(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionAssemblyDispatch &operator=(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionAssemblyDispatch &operator=(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionAssemblyDispatch(); - /** Initialize the function's source, destination, kernels and border_size. - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. 
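run() and prepare() above fetch every tensor from the ITensorPack by slot (ACL_SRC_0/1/2, ACL_DST for the user-visible tensors, ACL_INT_0/1 for the workspace and packed weights announced by workspace()), so callers inside the library assemble the pack themselves. A hypothetical wiring helper is sketched below; only the slot assignments are taken from the code above, while the helper name, the tensor arguments and the include paths are illustrative assumptions.

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"

// Hypothetical helper: build the pack layout that prepare()/run() above expect.
arm_compute::ITensorPack make_dwc_assembly_pack(arm_compute::ITensor *src, const arm_compute::ITensor *weights,
                                                const arm_compute::ITensor *bias, arm_compute::ITensor *dst,
                                                arm_compute::ITensor *workspace, arm_compute::ITensor *packed_weights)
{
    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, src);                 // input feature map
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, weights);       // [W, H, IFM] weights, consumed by prepare()
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, bias);          // optional 1D [IFM] bias, may be nullptr
    pack.add_tensor(arm_compute::TensorType::ACL_DST, dst);                   // output feature map
    pack.add_tensor(arm_compute::TensorType::ACL_INT_0, workspace);           // scratch buffer sized by workspace()
    pack.add_tensor(arm_compute::TensorType::ACL_INT_1, packed_weights);      // packed parameters written by prepare()
    return pack;
}

prepare() packs weights and bias into ACL_INT_1 once and marks them unused; run() then only needs src, dst and the workspace on subsequent calls.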
Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionAssemblyDispatch - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return An error status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const ConvolutionInfo &info); - /** Check if the optimized kernel can be used for the given kernel sizes and strides - * - * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC - * - * @param[in] input Input tensor info. - * @param[in] weights Weights tensor info. - * @param[in] info Depthwise convolution meta-data. - * - * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed. - */ - static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - struct LocalImpl; - std::unique_ptr _pImpl; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp new file mode 100644 index 0000000000..8812b777a3 --- /dev/null +++ b/src/runtime/cpu/operators/CpuDirectConv2d.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuDirectConv2d.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuDirectConv2d::~CpuDirectConv2d() = default; + +CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), + _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() +{ +} + +void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + _output_stage_kernel = std::make_unique(); + _conv_kernel = std::make_unique(); + _input_border_handler = std::make_unique(); + + // Free accumulator + if(_accumulator.buffer() != nullptr) + { + _accumulator.allocator()->free(); + } + + _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY; + + // Check if bias should be added in the convolution result + _has_bias = (bias != nullptr); + + _conv_kernel->configure(src, weights, dst, conv_info); + if(_has_bias) + { + _output_stage_kernel->configure(dst, bias); + } + _is_padding_required = !_conv_kernel->border_size().empty(); + + if(_is_padding_required) + { + // Add zero padding XY + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); + } + + //Configure Activation Layer + _is_activationlayer_enabled = act_info.enabled(); + if(_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique(); + _activationlayer_function->configure(dst, dst, act_info); + } +} + +Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + + // output might not be initialized since it can be an intermediate tensor of another layer + DataType data_type = src->data_type(); + TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); + + // Validate Convolution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), + "Biases size and number of input feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // Validate bias kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); + + if(act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); + } + + return Status{}; +} + +void CpuDirectConv2d::run(ITensorPack &tensors) 
+{ + MemoryGroupResourceScope scope_mg(_memory_group); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if(_is_padding_required) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_DST, src); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); + } + NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); + if(_has_bias) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, dst); + pack.add_tensor(TensorType::ACL_SRC_1, bias); + pack.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); + } + + if(_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.h b/src/runtime/cpu/operators/CpuDirectConv2d.h new file mode 100644 index 0000000000..9e584b9c49 --- /dev/null +++ b/src/runtime/cpu/operators/CpuDirectConv2d.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H +#define ARM_COMPUTE_CPU_DIRECTCONV2D_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/Tensor.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/cpu/ICpuKernel.h" +#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h" +#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" +#include "src/runtime/cpu/ICpuOperator.h" +#include "src/runtime/cpu/operators/CpuActivation.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +/** Function to run the direct convolution. 
+ * + * This function calls the following kernels: + * + * -# @ref NEFillBorderKernel for the input + * -# @ref kernels::CpuDirectConv2dOutputStageKernel + * -# @ref kernels::CpuDirectConv2dKernel + */ +class CpuDirectConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuDirectConv2d(std::shared_ptr memory_manager = nullptr); + /** Destructor */ + ~CpuDirectConv2d(); + /** Set the input, weights, biases and output tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in, out] src Input tensor info. Data types supported: F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p src. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. + * @param[out] dst Output tensor info. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr _output_stage_kernel; + std::unique_ptr _conv_kernel; + std::unique_ptr _input_border_handler; + std::unique_ptr _activationlayer_function; + Tensor _accumulator; + bool _has_bias{ false }; + bool _is_activationlayer_enabled{ false }; + unsigned int _dim_split{ 0 }; + bool _is_padding_required{ false }; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.cpp b/src/runtime/cpu/operators/CpuDirectConvolution.cpp deleted file mode 100644 index 33f79603e8..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConvolution.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDirectConvolution.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuDirectConvolution::~CpuDirectConvolution() = default; - -CpuDirectConvolution::CpuDirectConvolution(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() -{ -} - -void CpuDirectConvolution::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - _output_stage_kernel = std::make_unique(); - _conv_kernel = std::make_unique(); - _input_border_handler = std::make_unique(); - - // Free accumulator - if(_accumulator.buffer() != nullptr) - { - _accumulator.allocator()->free(); - } - - _dim_split = src->data_layout() == DataLayout::NCHW ? 
Window::DimZ : Window::DimY; - - // Check if bias should be added in the convolution result - _has_bias = (bias != nullptr); - - _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) - { - _output_stage_kernel->configure(dst, bias); - } - _is_padding_required = !_conv_kernel->border_size().empty(); - - if(_is_padding_required) - { - // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(dst, dst, act_info); - } -} - -Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - - // output might not be initialized since it can be an intermediate tensor of another layer - DataType data_type = src->data_type(); - TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); - - // Validate Convolution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionKernel::validate(src, weights, &accumulator, conv_info)); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), - "Biases size and number of input feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); - } - - // Validate bias kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionOutputStageKernel::validate(&accumulator, bias, dst)); - - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); - } - - return Status{}; -} - -void CpuDirectConvolution::run(ITensorPack &tensors) -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_is_padding_required) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); - } - NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, dst); - pack.add_tensor(TensorType::ACL_SRC_1, bias); - pack.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.h b/src/runtime/cpu/operators/CpuDirectConvolution.h deleted file mode 100644 index 0635e087fd..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConvolution.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H -#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h" -#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to run the direct convolution. - * - * This function calls the following kernels: - * - * -# @ref NEFillBorderKernel for the input - * -# @ref kernels::CpuDirectConvolutionOutputStageKernel - * -# @ref kernels::CpuDirectConvolutionKernel - */ -class CpuDirectConvolution : public ICpuOperator -{ -public: - /** Constructor */ - CpuDirectConvolution(std::shared_ptr memory_manager = nullptr); - /** Destructor */ - ~CpuDirectConvolution(); - /** Set the input, weights, biases and output tensors. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in, out] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
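The configuration constraints listed in the documentation above (1x1 and 3x3 kernels in F16/F32, 5x5 in F32, strides 1/2/3) are easiest to see with a concrete validate() call. The sketch below uses the renamed cpu::CpuDirectConv2d introduced by this patch, whose interface is identical to the CpuDirectConvolution shown here; shapes are illustrative, the internal header is only reachable when building inside the library tree, and a real caller would normally go through NEDirectConvolutionLayer instead.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/cpu/operators/CpuDirectConv2d.h" // renamed header introduced by this patch

using namespace arm_compute;

// Hypothetical 3x3 / stride 1 / F32 case, i.e. one of the supported configurations listed above.
Status direct_conv_validate_example()
{
    TensorInfo src(TensorShape(32U, 32U, 16U), 1, DataType::F32);       // NCHW: [W, H, IFM]
    TensorInfo weights(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32); // [kx, ky, IFM, OFM]
    TensorInfo bias(TensorShape(8U), 1, DataType::F32);                 // [OFM]
    TensorInfo dst(TensorShape(32U, 32U, 8U), 1, DataType::F32);        // pad 1 keeps W/H for a 3x3 kernel

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride_x, stride_y, pad_x, pad_y
    return cpu::CpuDirectConv2d::validate(&src, &weights, &bias, &dst, conv_info,
                                          ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
}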
- */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[in] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - MemoryGroup _memory_group; - std::unique_ptr _output_stage_kernel; - std::unique_ptr _conv_kernel; - std::unique_ptr _input_border_handler; - std::unique_ptr _activationlayer_function; - Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H */ diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp new file mode 100644 index 0000000000..b225199c40 --- /dev/null +++ b/src/runtime/cpu/operators/CpuPool2d.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuPool2d.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/cpu/kernels/CpuPool2dKernel.h" +#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuPool2d::CpuPool2d() + : _pooling_layer_kernel(), + _border_handler(), + _asm_glue(), + _is_global_pooling_layer(false), + _data_layout(DataLayout::NCHW), + _mem_req() +{ +} + +CpuPool2d::~CpuPool2d() = default; + +void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + // Check if we can run assembly kernels. Currently, indices are not supported by those kernels + const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + // Get data layout + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + + // Check if we have Global Pooling Layer + const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); + + if(run_optimised) + { + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + auto pooling_wrapper = std::make_unique(); + ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); + pooling_wrapper->configure(src, dst, pool_info, ci); + + // Get kernel's memory requirements + constexpr size_t alignment = 4096; + const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); + _mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); + + _asm_glue = std::move(pooling_wrapper); + } + else + { + // Configure pooling kernel + auto k = std::make_unique(); + k->configure(src, dst, pool_info, indices); + _pooling_layer_kernel = std::move(k); + + switch(_data_layout) + { + case DataLayout::NCHW: + { + // Configure border depending on operation required (quantize border in case of asymmetric data_type) + BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; + PixelValue zero_value((indices) ? 
std::numeric_limits::min() : 0.f); + if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) + { + zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + } + auto b = std::make_unique(); + b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); + _border_handler = std::move(b); + break; + } + case DataLayout::NHWC: + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + if(run_optimised) + { + return Status{}; + } + + return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices); +} + +void CpuPool2d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); + + if(_asm_glue) + { + const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; + NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); + } + else + { + switch(_data_layout) + { + case DataLayout::NCHW: + // Fill border + NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); + + // Run pooling layer + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); + break; + case DataLayout::NHWC: + // Run pooling layer + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +experimental::MemoryRequirements CpuPool2d::workspace() const +{ + return _mem_req; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h new file mode 100644 index 0000000000..ae3d115dfc --- /dev/null +++ b/src/runtime/cpu/operators/CpuPool2d.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
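In the NCHW path of CpuPool2d::configure above, the border handler pads with PixelValue(0, src->data_type(), src->quantization_info()) whenever the input is asymmetric-quantized and padding is not excluded, i.e. with the quantized representation of real 0 (the zero point) rather than the raw byte 0. A self-contained sketch of that mapping follows; it is the standard affine quantization formula with illustrative scale/offset values, not code from the library.

#include <algorithm>
#include <cmath>
#include <cstdint>

// QASYMM8 affine quantization: q = clamp(round(v / scale) + offset, 0, 255).
uint8_t quantize_qasymm8_sketch(float v, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// With scale = 0.1f and offset = 30, real 0.f pads the border as the byte 30, so padded
// elements contribute "real zero" to average pooling instead of skewing the result.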
+ */ +#ifndef ARM_COMPUTE_CPU_POOL2D_H +#define ARM_COMPUTE_CPU_POOL2D_H + +#include "arm_compute/core/experimental/Types.h" +#include "src/core/common/Macros.h" +#include "src/runtime/cpu/ICpuOperator.h" + +#include + +namespace arm_compute +{ +// Forward Declarations +struct PoolingLayerInfo; + +namespace cpu +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref NEFillBorderKernel (executed if padding size is different from zero) + * -# @ref kernels::CpuPool2dKernel + * -# @ref kernels::CpuPool2dAssemblyWrapperKernel + */ +class CpuPool2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuPool2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d); + /** Default destructor */ + ~CpuPool2d(); + /** Set the src and dst tensors. + * + * @note F16 is supported for pool sizes 2 and 3 only + * + * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr _pooling_layer_kernel; + std::unique_ptr _border_handler; + std::unique_ptr _asm_glue; + + bool _is_global_pooling_layer; + DataLayout _data_layout; + experimental::MemoryRequirements _mem_req; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_H */ diff --git a/src/runtime/cpu/operators/CpuPooling.cpp b/src/runtime/cpu/operators/CpuPooling.cpp deleted file mode 100644 index 3a6ac24a74..0000000000 --- a/src/runtime/cpu/operators/CpuPooling.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuPooling.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h" -#include "src/core/cpu/kernels/CpuPoolingKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuPooling::CpuPooling() - : _pooling_layer_kernel(), - _border_handler(), - _asm_glue(), - _is_global_pooling_layer(false), - _data_layout(DataLayout::NCHW), - _mem_req() -{ -} - -CpuPooling::~CpuPooling() = default; - -void CpuPooling::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - // Get data layout - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - - // Check if we have Global Pooling Layer - const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - - if(run_optimised) - { - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); - - auto pooling_wrapper = std::make_unique(); - ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); - pooling_wrapper->configure(src, dst, pool_info, ci); - - // Get kernel's memory requirements - constexpr size_t alignment = 4096; - const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); - - _asm_glue = std::move(pooling_wrapper); - } - else - { - // Configure pooling kernel - auto k = std::make_unique(); - k->configure(src, dst, pool_info, indices); - _pooling_layer_kernel = std::move(k); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value((indices) ? 
std::numeric_limits::min() : 0.f); - if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) - { - zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - } - auto b = std::make_unique(); - b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); - _border_handler = std::move(b); - break; - } - case DataLayout::NHWC: - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -Status CpuPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - if(run_optimised) - { - return Status{}; - } - - return kernels::CpuPoolingKernel::validate(src, dst, pool_info, indices); -} - -void CpuPooling::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - - if(_asm_glue) - { - const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; - NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); - } - else - { - switch(_data_layout) - { - case DataLayout::NCHW: - // Fill border - NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); - - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); - break; - case DataLayout::NHWC: - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -experimental::MemoryRequirements CpuPooling::workspace() const -{ - return _mem_req; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h deleted file mode 100644 index bc30adf762..0000000000 --- a/src/runtime/cpu/operators/CpuPooling.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
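The configure() shown above flags the layer as global pooling when the pooling window matches the source width and height, and run() then switches the scheduling hint of the assembly kernel to DimX. A short sketch of the two cases is given below, assuming the PoolingLayerInfo constructor that takes a Size2D window and a data layout; treat the exact constructor signatures and values as assumptions of this example rather than something guaranteed by this patch.

#include "arm_compute/core/Types.h"

using namespace arm_compute;

// For a hypothetical NHWC source with spatial size 7x7:
// a 7x7 window equals the source W/H, so configure() treats it as global pooling.
const PoolingLayerInfo global_pool(PoolingType::AVG, Size2D(7U, 7U), DataLayout::NHWC, PadStrideInfo(1, 1, 0, 0));

// A 3x3 window with stride 2 takes the regular (non-global) pooling path.
const PoolingLayerInfo tiled_pool(PoolingType::MAX, Size2D(3U, 3U), DataLayout::NHWC, PadStrideInfo(2, 2, 1, 1));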
- */ -#ifndef ARM_COMPUTE_CPU_POOLING_H -#define ARM_COMPUTE_CPU_POOLING_H - -#include "src/runtime/cpu/ICpuOperator.h" - -#include "arm_compute/core/experimental/Types.h" - -#include - -namespace arm_compute -{ -// Forward Declarations -struct PoolingLayerInfo; - -namespace cpu -{ -/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: - * - * -# @ref NEFillBorderKernel (executed if padding size is different from zero) - * -# @ref kernels::CpuPoolingKernel - * -# @ref kernels::CpuPoolingAssemblyWrapperKernel - */ -class CpuPooling : public ICpuOperator -{ -public: - /** Constructor */ - CpuPooling(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuPooling(const CpuPooling &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuPooling &operator=(const CpuPooling &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - CpuPooling(CpuPooling &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - CpuPooling &operator=(CpuPooling &&) = delete; - /** Default destructor */ - ~CpuPooling(); - /** Set the src and dst tensors. - * - * @note F16 is supported for pool sizes 2 and 3 only - * - * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. - */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref CpuPooling - * - * @note F16 is supported for pool sizes 2 and 3 only - * - * @param[in] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[in] indices (optional) Tensor info of the indices of the maximal values. Data type supported: U32. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - std::unique_ptr _pooling_layer_kernel; - std::unique_ptr _border_handler; - std::unique_ptr _asm_glue; - - bool _is_global_pooling_layer; - DataLayout _data_layout; - experimental::MemoryRequirements _mem_req; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOLING_H */ diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp new file mode 100644 index 0000000000..527b3a65f9 --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/kernels/ClActivationKernel.h" +#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace +{ +ITensorPack select_activation_src_dst(ITensorPack &tensors) +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST)); + return pack; +} +} // namespace + +void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + // Configure direct convolution kernel + auto k = std::make_unique(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, weights, biases, dst, conv_info); + _direct_conv_kernel = std::move(k); + + // Configure border handler + PixelValue zero_value(0.f); + if(is_data_type_quantized_asymmetric(src->data_type())) + { + zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + } + auto b = std::make_unique(); + b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value); + _src_border_handler = std::move(b); + + if(act_info.enabled()) + { + auto a = std::make_unique(); + a->configure(compile_context, dst, dst, act_info); + _activation_kernel = std::move(a); + } + + // Tune kernels + CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); +} + +Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target())); + if(act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); + } + return Status{}; +} + +void ClDirectConv2d::run(ITensorPack &tensors) +{ + // Run border handler + CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false); + // Run direct 
convolution + CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); + // Run activation kernel + if(_activation_kernel) + { + auto act_pack = select_activation_src_dst(tensors); + CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); + } +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.h b/src/runtime/gpu/cl/operators/ClDirectConv2d.h new file mode 100644 index 0000000000..e069733fab --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H +#define ARM_COMPUTE_CL_DIRECT_CONV2D_H + +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" +#include "src/runtime/gpu/cl/IClOperator.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if padding size is different from zero) + * -# @ref opencl::ClDirectConv2d + */ +class ClDirectConv2d : public IClOperator +{ +public: + /** Constructor */ + ClDirectConv2d() = default; + /** Set the src and dst tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of srcs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts. + * Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
+ * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited method overridden + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr _direct_conv_kernel{ nullptr }; + std::unique_ptr _src_border_handler{ nullptr }; + std::unique_ptr _activation_kernel{ nullptr }; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */ \ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp b/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp deleted file mode 100644 index 3382a6c3c5..0000000000 --- a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClActivationKernel.h" -#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace -{ -ITensorPack select_activation_src_dst(ITensorPack &tensors) -{ - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST)); - pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST)); - return pack; -} -} // namespace - -void ClDirectConvolution::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - // Configure direct convolution kernel - auto k = std::make_unique(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, src, weights, biases, dst, conv_info); - _direct_conv_kernel = std::move(k); - - // Configure border handler - PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(src->data_type())) - { - zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - } - auto b = std::make_unique(); - b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value); - _src_border_handler = std::move(b); - - if(act_info.enabled()) - { - auto a = std::make_unique(); - a->configure(compile_context, dst, dst, act_info); - _activation_kernel = std::move(a); - } - - // Tune kernels - CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); -} - -Status ClDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConvolutionKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target())); - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); - } - return Status{}; -} - -void ClDirectConvolution::run(ITensorPack &tensors) -{ - // Run border handler - CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false); - // Run direct convolution - CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); - // Run activation kernel - if(_activation_kernel) - { - auto act_pack = select_activation_src_dst(tensors); - CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); - } -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.h b/src/runtime/gpu/cl/operators/ClDirectConvolution.h deleted file mode 100644 index e7ad927b0b..0000000000 --- a/src/runtime/gpu/cl/operators/ClDirectConvolution.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H -#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels: - * - * -# @ref CLFillBorderKernel (executed if padding size is different from zero) - * -# @ref opencl::ClDirectConvolution - */ -class ClDirectConvolution : public IClOperator -{ -public: - /** Constructor */ - ClDirectConvolution() = default; - /** Set the src and dst tensors. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of srcs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts. - * Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolution - * - * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of srcs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts. - * Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited method overridden - void run(ITensorPack &tensors) override; - -private: - std::unique_ptr _direct_conv_kernel{ nullptr }; - std::unique_ptr _src_border_handler{ nullptr }; - std::unique_ptr _activation_kernel{ nullptr }; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H */ \ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClPool2d.cpp b/src/runtime/gpu/cl/operators/ClPool2d.cpp new file mode 100644 index 0000000000..40c2b0a8ba --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClPool2d.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/gpu/cl/operators/ClPool2d.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/kernels/ClPool2dKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + // Configure pooling kernel + auto k = std::make_unique(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, dst, info, indices); + _pooling = std::move(k); + + const DataType data_type = src->data_type(); + + // Configure border depending on operation required (quantize border in case of asymmetric data_type) + BorderMode border_mode{}; + PixelValue pixel_value(0.f); + if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding) + { + pixel_value = PixelValue(0, data_type, src->quantization_info()); + } + + // Data layout + const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + + switch(data_layout) + { + case DataLayout::NCHW: + border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT; + break; + case DataLayout::NHWC: + border_mode = BorderMode::CONSTANT; + if(PoolingType::MAX == info.pool_type) + { + if(is_data_type_quantized(data_type)) + { + std::tie(pixel_value, std::ignore) = get_min_max(data_type); + } + else + { + pixel_value = PixelValue(std::numeric_limits::lowest()); + } + } + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + auto b = std::make_unique(); + b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value); + _border_handler = std::move(b); + + // Tune kernels + CLScheduler::get().tune_kernel_static(*_pooling); +} + +Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices) +{ + return kernels::ClPool2dKernel::validate(src, dst, info, indices); +} + +void ClPool2d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false); + CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClPool2d.h b/src/runtime/gpu/cl/operators/ClPool2d.h new file mode 100644 index 0000000000..8ac386a64b --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClPool2d.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_POOL2D_H +#define ARM_COMPUTE_CL_POOL2D_H + +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/runtime/gpu/cl/IClOperator.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if padding size is different from zero) + * -# @ref opencl::ClPool2d + */ +class ClPool2d : public IClOperator +{ +public: + /** Constructor */ + ClPool2d() = default; + /** Configure operator for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src + * @param[in] info Pooling layer parameters. + * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); + + // Inherited method overridden + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr _pooling{ nullptr }; + std::unique_ptr _border_handler{ nullptr }; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_POOL2D_H */ diff --git a/src/runtime/gpu/cl/operators/ClPooling.cpp b/src/runtime/gpu/cl/operators/ClPooling.cpp deleted file mode 100644 index 8610eb9842..0000000000 --- a/src/runtime/gpu/cl/operators/ClPooling.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClPooling.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClPoolingKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClPooling::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - // Configure pooling kernel - auto k = std::make_unique(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, src, dst, info, indices); - _pooling = std::move(k); - - const DataType data_type = src->data_type(); - - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode{}; - PixelValue pixel_value(0.f); - if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding) - { - pixel_value = PixelValue(0, data_type, src->quantization_info()); - } - - // Data layout - const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - - switch(data_layout) - { - case DataLayout::NCHW: - border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - break; - case DataLayout::NHWC: - border_mode = BorderMode::CONSTANT; - if(PoolingType::MAX == info.pool_type) - { - if(is_data_type_quantized(data_type)) - { - std::tie(pixel_value, std::ignore) = get_min_max(data_type); - } - else - { - pixel_value = PixelValue(std::numeric_limits::lowest()); - } - } - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - auto b = std::make_unique(); - b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value); - _border_handler = std::move(b); - - // Tune kernels - CLScheduler::get().tune_kernel_static(*_pooling); -} - -Status ClPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices) -{ - return kernels::ClPoolingKernel::validate(src, dst, info, indices); -} - -void ClPooling::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false); - CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClPooling.h b/src/runtime/gpu/cl/operators/ClPooling.h deleted file mode 100644 index 99de6d0dcf..0000000000 --- a/src/runtime/gpu/cl/operators/ClPooling.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_POOLING_H -#define ARM_COMPUTE_CL_POOLING_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels: - * - * -# @ref CLFillBorderKernel (executed if padding size is different from zero) - * -# @ref opencl::ClPooling - */ -class ClPooling : public IClOperator -{ -public: - /** Constructor */ - ClPooling() = default; - /** Configure operator for a given list of arguments - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data type supported: same as @p src - * @param[in] info Pooling layer parameters. - * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref ClPooling - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data type supported: same as @p src - * @param[in] info Pooling layer parameters. - * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); - - // Inherited method overridden - void run(ITensorPack &tensors) override; - -private: - std::unique_ptr _pooling{ nullptr }; - std::unique_ptr _border_handler{ nullptr }; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_POOLING_H */ diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp index 5c2ebaa51f..ddf3faacb6 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" +#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" #include "tests/NEON/Accessor.h" #include "tests/NEON/Helper.h" #include "tests/framework/Macros.h" @@ -38,7 +38,7 @@ namespace validation using namespace arm_compute::misc::shape_calculator; // Create function for CpuDepthwiseConvolutionKernel -using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>; +using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConv2dNativeKernel>; // Fixture for NEDepthwiseConvolutionLayerKernel template <typename T> @@ -124,7 +124,7 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL) auto biases = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout); auto dst = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout); - cpu::kernels::CpuDepthwiseConvolutionNativeKernel dwc; + cpu::kernels::CpuDepthwiseConv2dNativeKernel dwc; const ConvolutionInfo info{pad_stride_info, 1, ActivationLayerInfo(), Size2D(1, 1)}; dwc.configure(src.info(), weights.info(), biases.info(), dst.info(), info); -- cgit v1.2.1
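
Editor's note: the renamed operators in this patch keep the experimental stateless workflow that the runtime functions (CLPoolingLayer, NEPoolingLayer) drive internally: configure() and validate() work on ITensorInfo metadata only, and run() receives the backing tensors through an ITensorPack. The sketch below illustrates that flow for the new ClPool2d; it is not part of the patch, and the chosen shapes, pooling parameters and CL initialisation calls (CLScheduler::default_init(), CLKernelLibrary::get().get_compile_context()) are assumptions made purely for the example.

// Illustrative sketch (not part of this patch): driving the renamed ClPool2d operator directly.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClPool2d.h"

using namespace arm_compute;

int main()
{
    // Set up the default CL context, queue and kernel library (example assumption)
    CLScheduler::get().default_init();

    // NHWC shapes are ordered [C, W, H]: a 1-channel 8x8 input pooled 2x2 / stride 2 gives a 4x4 output
    TensorInfo src_info(TensorShape(1U, 8U, 8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(1U, 4U, 4U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);
    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

    // Static validation, then stateless configuration on tensor metadata only
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClPool2d::validate(&src_info, &dst_info, pool_info));
    opencl::ClPool2d pool;
    pool.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, pool_info);

    // Backing tensors are only needed at run time and are handed over via an ITensorPack,
    // mirroring what CLPoolingLayer does internally with ACL_SRC / ACL_DST
    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    pool.run(pack);

    // Wait for the enqueued border-fill and pooling kernels to finish
    CLScheduler::get().sync();
    return 0;
}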