aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorManuel Bottini <manuel.bottini@arm.com>2021-05-24 16:01:32 +0100
committerGeorgios Pinitas <georgios.pinitas@arm.com>2021-06-01 12:52:48 +0000
commitb4bb6a03f717a320b935809fde795b3d6ec5a69f (patch)
tree9c7913282579995d91e79c1a3486605c28dfa595
parent433ea4981675b64c44c8f47f2f4aac6bfcbfc911 (diff)
downloadComputeLibrary-b4bb6a03f717a320b935809fde795b3d6ec5a69f.tar.gz
Rename ported functions
Rename CpuPooling to CpuPool2d Rename CpuPoolingKernel to CpuPool2dKernel Rename CpuPoolingAssemblyWrapperKernel to CpuPool2dAssemblyWrapperKernel Move CpuPool2dAssemblyWrapperKernel in internal subfolder Rename CpuDepthwiseConvolutionNativeKernel to CpuDepthwiseConv2dNativeKernel Rename CpuDepthwiseConvolutionAssemblyDispatch to CpuDepthwiseConv2dAssemblyDispatch Rename CpuDepthwiseConvolution to CpuDepthwiseConv2d Rename CpuDirectConvolutionKernel to CpuDirectConv2dKernel Rename CpuDirectConvolutionOutputStageKernel to CpuDirectConv2dOutputStageKernel Rename CpuDirectConvolution to CpuDirectConv2d Rename ClPoolingKernel to ClPool2dKernel Rename ClPooling to ClPool2d Rename ClDirectConvolutionKernel to ClDirectConv2dKernel Resolves: COMPMID-4405 Change-Id: I8e48f015e4e492a76a7512f5679cb3eb0cd028f6 Signed-off-by: Manuel Bottini <manuel.bottini@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5708 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp26
-rw-r--r--SConscript17
-rw-r--r--arm_compute/runtime/CL/functions/CLPoolingLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEPoolingLayer.h2
-rwxr-xr-xscripts/clang_tidy_rules.py9
-rw-r--r--src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h4
-rw-r--r--src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h2
-rw-r--r--src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp (renamed from src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp)147
-rw-r--r--src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h (renamed from src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h)40
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dKernel.cpp (renamed from src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp)16
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dKernel.h (renamed from src/core/cpu/kernels/CpuDirectConvolutionKernel.h)25
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp (renamed from src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp)16
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h (renamed from src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h)26
-rw-r--r--src/core/cpu/kernels/CpuPool2dKernel.cpp (renamed from src/core/cpu/kernels/CpuPoolingKernel.cpp)14
-rw-r--r--src/core/cpu/kernels/CpuPool2dKernel.h (renamed from src/core/cpu/kernels/CpuPoolingKernel.h)21
-rw-r--r--src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp (renamed from src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp)16
-rw-r--r--src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h (renamed from src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h)26
-rw-r--r--src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp (renamed from src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp)14
-rw-r--r--src/core/gpu/cl/kernels/ClDirectConv2dKernel.h (renamed from src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h)26
-rw-r--r--src/core/gpu/cl/kernels/ClPool2dKernel.cpp (renamed from src/core/gpu/cl/kernels/ClPoolingKernel.cpp)12
-rw-r--r--src/core/gpu/cl/kernels/ClPool2dKernel.h (renamed from src/core/gpu/cl/kernels/ClPoolingKernel.h)19
-rw-r--r--src/runtime/CL/functions/CLDirectConvolutionLayer.cpp16
-rw-r--r--src/runtime/CL/functions/CLPoolingLayer.cpp14
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp66
-rw-r--r--src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp16
-rw-r--r--src/runtime/NEON/functions/NEPoolingLayer.cpp16
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp (renamed from src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp)162
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2d.h213
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp (renamed from src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp)108
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h (renamed from src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h)53
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConvolution.h230
-rw-r--r--src/runtime/cpu/operators/CpuDirectConv2d.cpp (renamed from src/runtime/cpu/operators/CpuDirectConvolution.cpp)22
-rw-r--r--src/runtime/cpu/operators/CpuDirectConv2d.h (renamed from src/runtime/cpu/operators/CpuDirectConvolution.h)58
-rw-r--r--src/runtime/cpu/operators/CpuPool2d.cpp (renamed from src/runtime/cpu/operators/CpuPooling.cpp)28
-rw-r--r--src/runtime/cpu/operators/CpuPool2d.h (renamed from src/runtime/cpu/operators/CpuPooling.h)38
-rw-r--r--src/runtime/gpu/cl/operators/ClDirectConv2d.cpp (renamed from src/runtime/gpu/cl/operators/ClDirectConvolution.cpp)18
-rw-r--r--src/runtime/gpu/cl/operators/ClDirectConv2d.h (renamed from src/runtime/gpu/cl/operators/ClDirectConvolution.h)25
-rw-r--r--src/runtime/gpu/cl/operators/ClPool2d.cpp (renamed from src/runtime/gpu/cl/operators/ClPooling.cpp)14
-rw-r--r--src/runtime/gpu/cl/operators/ClPool2d.h (renamed from src/runtime/gpu/cl/operators/ClPooling.h)19
-rw-r--r--tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp6
41 files changed, 747 insertions, 857 deletions
diff --git a/Android.bp b/Android.bp
index 13b70ea0b3..d1003f2d7d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -283,18 +283,17 @@ cc_library_static {
"src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp",
"src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp",
"src/core/cpu/kernels/CpuCopyKernel.cpp",
- "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp",
+ "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp",
"src/core/cpu/kernels/CpuDequantizeKernel.cpp",
- "src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp",
- "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp",
+ "src/core/cpu/kernels/CpuDirectConv2dKernel.cpp",
+ "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp",
"src/core/cpu/kernels/CpuElementwiseKernel.cpp",
"src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp",
"src/core/cpu/kernels/CpuFillKernel.cpp",
"src/core/cpu/kernels/CpuFloorKernel.cpp",
"src/core/cpu/kernels/CpuMulKernel.cpp",
"src/core/cpu/kernels/CpuPermuteKernel.cpp",
- "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp",
- "src/core/cpu/kernels/CpuPoolingKernel.cpp",
+ "src/core/cpu/kernels/CpuPool2dKernel.cpp",
"src/core/cpu/kernels/CpuQuantizeKernel.cpp",
"src/core/cpu/kernels/CpuReshapeKernel.cpp",
"src/core/cpu/kernels/CpuScaleKernel.cpp",
@@ -321,6 +320,7 @@ cc_library_static {
"src/core/cpu/kernels/add/sve/qsymm16.cpp",
"src/core/cpu/kernels/floor/NEON/fp16.cpp",
"src/core/cpu/kernels/floor/NEON/fp32.cpp",
+ "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
"src/core/cpu/kernels/pooling/neon/fp16.cpp",
"src/core/cpu/kernels/pooling/neon/fp32.cpp",
"src/core/cpu/kernels/pooling/neon/nchw/all.cpp",
@@ -348,7 +348,7 @@ cc_library_static {
"src/core/gpu/cl/kernels/ClCropKernel.cpp",
"src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
"src/core/gpu/cl/kernels/ClDequantizeKernel.cpp",
- "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp",
+ "src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp",
"src/core/gpu/cl/kernels/ClElementwiseKernel.cpp",
"src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp",
"src/core/gpu/cl/kernels/ClFillKernel.cpp",
@@ -362,7 +362,7 @@ cc_library_static {
"src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp",
"src/core/gpu/cl/kernels/ClMulKernel.cpp",
"src/core/gpu/cl/kernels/ClPermuteKernel.cpp",
- "src/core/gpu/cl/kernels/ClPoolingKernel.cpp",
+ "src/core/gpu/cl/kernels/ClPool2dKernel.cpp",
"src/core/gpu/cl/kernels/ClQuantizeKernel.cpp",
"src/core/gpu/cl/kernels/ClReshapeKernel.cpp",
"src/core/gpu/cl/kernels/ClScaleKernel.cpp",
@@ -635,10 +635,10 @@ cc_library_static {
"src/runtime/cpu/operators/CpuConcatenate.cpp",
"src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp",
"src/runtime/cpu/operators/CpuCopy.cpp",
- "src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp",
- "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp",
+ "src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp",
+ "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp",
"src/runtime/cpu/operators/CpuDequantize.cpp",
- "src/runtime/cpu/operators/CpuDirectConvolution.cpp",
+ "src/runtime/cpu/operators/CpuDirectConv2d.cpp",
"src/runtime/cpu/operators/CpuElementwise.cpp",
"src/runtime/cpu/operators/CpuElementwiseUnary.cpp",
"src/runtime/cpu/operators/CpuFill.cpp",
@@ -647,7 +647,7 @@ cc_library_static {
"src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp",
"src/runtime/cpu/operators/CpuMul.cpp",
"src/runtime/cpu/operators/CpuPermute.cpp",
- "src/runtime/cpu/operators/CpuPooling.cpp",
+ "src/runtime/cpu/operators/CpuPool2d.cpp",
"src/runtime/cpu/operators/CpuQuantize.cpp",
"src/runtime/cpu/operators/CpuReshape.cpp",
"src/runtime/cpu/operators/CpuScale.cpp",
@@ -663,7 +663,7 @@ cc_library_static {
"src/runtime/gpu/cl/operators/ClCopy.cpp",
"src/runtime/gpu/cl/operators/ClCrop.cpp",
"src/runtime/gpu/cl/operators/ClDequantize.cpp",
- "src/runtime/gpu/cl/operators/ClDirectConvolution.cpp",
+ "src/runtime/gpu/cl/operators/ClDirectConv2d.cpp",
"src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp",
"src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp",
"src/runtime/gpu/cl/operators/ClFill.cpp",
@@ -674,7 +674,7 @@ cc_library_static {
"src/runtime/gpu/cl/operators/ClMul.cpp",
"src/runtime/gpu/cl/operators/ClPRelu.cpp",
"src/runtime/gpu/cl/operators/ClPermute.cpp",
- "src/runtime/gpu/cl/operators/ClPooling.cpp",
+ "src/runtime/gpu/cl/operators/ClPool2d.cpp",
"src/runtime/gpu/cl/operators/ClQuantize.cpp",
"src/runtime/gpu/cl/operators/ClReshape.cpp",
"src/runtime/gpu/cl/operators/ClScale.cpp",
diff --git a/SConscript b/SConscript
index ac15c7fcfb..143823d013 100644
--- a/SConscript
+++ b/SConscript
@@ -300,12 +300,11 @@ if env['neon']:
cpu_kernel_hp_files = ['src/core/cpu/kernels/CpuActivationKernel.cpp',
'src/core/cpu/kernels/CpuCastKernel.cpp',
- 'src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp',
- 'src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp',
- 'src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp',
+ 'src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp',
+ 'src/core/cpu/kernels/CpuDirectConv2dKernel.cpp',
+ 'src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp',
'src/core/cpu/kernels/CpuPermuteKernel.cpp',
- 'src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp',
- 'src/core/cpu/kernels/CpuPoolingKernel.cpp',
+ 'src/core/cpu/kernels/CpuPool2dKernel.cpp',
'src/core/cpu/kernels/CpuReshapeKernel.cpp',
]
cpu_kernel_files = ['src/core/cpu/kernels/CpuAddKernel.cpp',
@@ -352,12 +351,12 @@ if env['neon']:
]
cpu_operator_hp_files = ['src/runtime/cpu/operators/CpuActivation.cpp',
'src/runtime/cpu/operators/CpuCast.cpp',
- 'src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp',
- 'src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp',
- 'src/runtime/cpu/operators/CpuDirectConvolution.cpp',
+ 'src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp',
+ 'src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp',
+ 'src/runtime/cpu/operators/CpuDirectConv2d.cpp',
'src/runtime/cpu/operators/CpuFlatten.cpp',
'src/runtime/cpu/operators/CpuPermute.cpp',
- 'src/runtime/cpu/operators/CpuPooling.cpp',
+ 'src/runtime/cpu/operators/CpuPool2d.cpp',
]
cpu_operator_files = ['src/runtime/cpu/operators/CpuAdd.cpp',
'src/runtime/cpu/operators/CpuConcatenate.cpp',
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 902feca234..1975e15470 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -36,7 +36,7 @@ class CLCompileContext;
class ICLTensor;
class ITensorInfo;
-/** Basic function to run @ref opencl::ClPooling */
+/** Basic function to run @ref opencl::ClPool2d */
class CLPoolingLayer : public IFunction
{
public:
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 82cabed6c9..70352fdfaa 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -39,7 +39,7 @@ class ITensorInfo;
*
* This function calls the following:
*
- * -# @ref cpu::CpuDirectConvolution
+ * -# @ref cpu::CpuDirectConv2d
*/
class NEDirectConvolutionLayer : public IFunction
{
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 1de587b444..b5366fa1c1 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -39,7 +39,7 @@ class ITensorInfo;
/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
*
- * -# @ref cpu::CpuPooling
+ * -# @ref cpu::CpuPool2d
*/
class NEPoolingLayer : public IFunction
{
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 97929b4eac..649d9343bb 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -108,15 +108,14 @@ def filter_clang_tidy_lines( lines ):
("NEWinogradLayerKernel.cpp" in line and "use '= default' to define a trivial destructor" in line) or
("NEGEMMLowpMatrixMultiplyCore.cpp" in line and "constructor does not initialize these fields" in line) or
("NEGEMMLowpAssemblyMatrixMultiplyCore" in line and "constructor does not initialize these fields" in line) or
- ("CpuDepthwiseConvolutionNativeKernel" in line and re.search(r"parameter '[^']+' is unused", line)) or
- ("CpuDepthwiseConvolutionAssemblyDispatch" in line and re.search(r"parameter '[^']+' is unused", line)) or
- ("CpuDepthwiseConvolutionAssemblyDispatch" in line and "modernize-use-equals-default" in line) or
+ ("CpuDepthwiseConv2dNativeKernel" in line and re.search(r"parameter '[^']+' is unused", line)) or
+ ("CpuDepthwiseConv2dAssemblyDispatch" in line and re.search(r"parameter '[^']+' is unused", line)) or
+ ("CpuDepthwiseConv2dAssemblyDispatch" in line and "modernize-use-equals-default" in line) or
("CPUUtils.cpp" in line and "consider replacing 'unsigned long' with 'uint64'" in line) or
("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or
("CPUUtils.cpp" in line and "warning: uninitialized record type" in line) or
("Utils.h" in line and "warning: Use of zero-allocated memory" in line) or
- ("CpuDepthwiseConvolutionNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line) or # This is to prevent false positive, should be reassessed with the newer clang-tidy
- ("CpuDepthwiseConvolutionNativeKernel.cpp" in line and "cppcoreguidelines-pro-type-member-init" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy
+ ("CpuDepthwiseConv2dNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy
print_context=False
continue
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index cc96cf1a1f..45481d0507 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -53,7 +53,7 @@ public:
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] indices Tensor containing the offset to store the input elements in the output tensor.
- * @ref opencl::ClPooling with indices should precede this function in order to
+ * @ref CLPoolingLayer with indices should precede this function in order to
* properly reconstruct the output tensor.
* The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
@@ -65,7 +65,7 @@ public:
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] output Destination tensor info. Data types supported: Same as @p input.
* @param[in] indices TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
- * @ref opencl::ClPooling with indices should precede this function in order to
+ * @ref CLPoolingLayer with indices should precede this function in order to
* properly reconstruct the output tensor.
* The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index f42272826c..ecc116e585 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -56,7 +56,7 @@ public:
*
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] indices Tensor containing the offset to store the input elements in the output tensor.
- * @ref cpu::kernels::CpuPoolingKernel with indices should precede this function in order to
+ * @ref NEPoolingLayer with indices should precede this function in order to
* properly reconstruct the output tensor.
* The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index a5d1b61c08..4ddb35f2d5 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorInfo.h"
@@ -74,7 +74,7 @@ struct DepthwiseConvolutionRunInfo
const size_t input_width;
const size_t input_depth;
- DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
+ DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
: num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
x_start(w.x().start()),
x_end(w.x().end()),
@@ -110,14 +110,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u
}
template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
const Size2D &dilation, const Window &window, bool has_biases)
{
constexpr auto element_per_vector = vector_size / sizeof(T);
using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
@@ -135,9 +135,9 @@ void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights,
Window win_output = window;
win_output.set(Window::DimX, dim_manual_loop);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -224,10 +224,10 @@ void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights,
}
template <typename T>
-void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Window execution_window = window;
execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
@@ -246,9 +246,9 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con
Window win_output = window;
win_output.set_dimension_step(Window::DimX, run_info.x_step);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -306,23 +306,24 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con
}
template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+ const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
+ ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
constexpr auto element_per_vector = vector_size / sizeof(T);
using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
using AccType = int32_t;
using AccArrayType = std::array<AccType, element_per_vector>;
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
- const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
+ const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
+ const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
Window execution_window = window;
@@ -339,9 +340,9 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
Window win_output = window;
win_output.set(Window::DimX, dim_manual_loop);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -482,18 +483,18 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
}
template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+ const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
using AccType = int32_t;
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
- const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
+ const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
+ const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
Window execution_window = window;
@@ -512,9 +513,9 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
Window win_output = window;
win_output.set_dimension_step(Window::DimX, run_info.x_step);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -585,8 +586,8 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
}
template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+ const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
constexpr int half_vec = vector_size / 2;
@@ -595,11 +596,11 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITenso
using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input->info()->quantization_info().uniform().offset), TagType{})));
+ const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
- const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
+ const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
@@ -624,9 +625,9 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITenso
Window win_output = window;
win_output.set_dimension_step(Window::DimX, run_info.x_step);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -722,16 +723,16 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITenso
input_it, weights_it, biases_it, output_it);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > input->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > input->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * info.depth_multiplier) != weights->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
@@ -742,7 +743,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
if(biases != nullptr)
@@ -750,7 +751,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if(is_data_type_quantized_asymmetric(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -760,36 +761,36 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
}
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
return Status{};
}
} // namespace
-CpuDepthwiseConvolutionNativeKernel::CpuDepthwiseConvolutionNativeKernel()
+CpuDepthwiseConv2dNativeKernel::CpuDepthwiseConv2dNativeKernel()
: _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
{
}
-void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, weights, (biases != nullptr) ? biases : nullptr, output, info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info));
_conv_info = info.pad_stride_info;
_depth_multiplier = info.depth_multiplier;
_dilation = info.dilation;
_has_biases = (biases != nullptr);
- if(is_data_type_quantized(input->data_type()))
+ if(is_data_type_quantized(src->data_type()))
{
- const auto input_scale = input->quantization_info().uniform().scale;
- const auto output_scale = output->quantization_info().uniform().scale;
+ const auto input_scale = src->quantization_info().uniform().scale;
+ const auto output_scale = dst->quantization_info().uniform().scale;
auto weights_scale = weights->quantization_info().scale();
if(!is_data_type_quantized_per_channel(weights->data_type()))
@@ -815,50 +816,50 @@ void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, co
switch(weights->data_type())
{
case DataType::QASYMM8:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, uint8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>;
break;
case DataType::QASYMM8_SIGNED:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
break;
case DataType::QSYMM8_PER_CHANNEL:
- if(input->data_type() == DataType::QASYMM8)
+ if(src->data_type() == DataType::QASYMM8)
{
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, int8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>;
}
else
{
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
}
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float16_t, float16_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float, float>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
}
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
- Window win = calculate_max_window(*output, Steps());
+ Window win = calculate_max_window(*dst, Steps());
ICpuKernel::configure(win);
}
-Status CpuDepthwiseConvolutionNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
return Status{};
}
-template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::FloatEnalber<T>>
-void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnalber<T>>
+void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+ ITensor *dst, const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
@@ -873,9 +874,9 @@ void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, cons
}
}
-template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::Quantized8bitEnalber<T>>
-void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnalber<T>>
+void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+ ITensor *dst, const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
@@ -900,7 +901,7 @@ void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, cons
}
}
-void CpuDepthwiseConvolutionNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index 242536d441..559c46dc93 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/common/Macros.h"
@@ -40,46 +40,38 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class CpuDepthwiseConvolutionNativeKernel : public ICpuKernel
+class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
{
public:
const char *name() const override
{
- return "CpuDepthwiseConvolutionNativeKernel";
+ return "CpuDepthwiseConv2dNativeKernel";
}
/** Default constructor */
- CpuDepthwiseConvolutionNativeKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConvolutionNativeKernel);
+ CpuDepthwiseConv2dNativeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
/** Initialize the function's source, destination and parameters.
*
* @note Supported data layouts: NHWC
*
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] src Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
* @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor. Data type supported: Same as @p src.
* @param[in] info Depthwise convolution meta-data.
*
*/
- void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionNativeKernel
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
*
- * @note Supported data layouts: NHWC
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] info Depthwise convolution meta-data.
+ * Similar to CpuDepthwiseConv2dNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -101,7 +93,7 @@ private:
*
* @param[in] window Region on which to execute the kernel.
*/
- using DepthwiseFunctionPtr = void (CpuDepthwiseConvolutionNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+ using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
DepthwiseFunctionPtr _func;
PadStrideInfo _conv_info;
@@ -114,4 +106,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H */
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
index 4f46eb2bf6..c0fc41525e 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -995,7 +995,7 @@ bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights)
} // namespace
template <typename T>
-void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
+void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
{
// This function assumes that input and weights have not padding in channel
@@ -1116,7 +1116,7 @@ void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, c
}
template <typename T>
-void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
+void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
{
// Declare useful types
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -1219,12 +1219,12 @@ void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITens
out);
}
-BorderSize CpuDirectConvolutionKernel::border_size() const
+BorderSize CpuDirectConv2dKernel::border_size() const
{
return _border_size;
}
-void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
+void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -1263,7 +1263,7 @@ void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weight
ICpuKernel::configure(win_config.second);
}
-Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
{
unsigned int num_weight_elems_read_per_row = 0;
unsigned int num_elems_read_per_iteration = 0;
@@ -1283,7 +1283,7 @@ Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITenso
return Status{};
}
-void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -1376,7 +1376,7 @@ void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &wind
}
}
}
-const char *CpuDirectConvolutionKernel::name() const
+const char *CpuDirectConv2dKernel::name() const
{
return "CpuDirectConvolutionLayerKernel";
}
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
index fb8218394b..62ed96f255 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h
+++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
#include "src/core/common/Macros.h"
#include "src/core/cpu/ICpuKernel.h"
@@ -35,13 +35,13 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform Direct Convolution Layer. */
-class CpuDirectConvolutionKernel : public ICpuKernel
+class CpuDirectConv2dKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuDirectConvolutionKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionKernel);
- /** Set the input, weights, and output tensors.
+ CpuDirectConv2dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
+ /** Set the src, weights, and dst tensors.
*
* @note: DirectConvolution only works in the following configurations:
* 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
@@ -57,16 +57,9 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported:Same as @p input.
- * @param[in] dst Output tensor.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * Similar to CpuDirectConv2dKernel::configure()
*
* @return a status
*/
@@ -97,4 +90,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
index 5f7a574e5a..662d052941 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -384,8 +384,8 @@ void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window,
}
} // namespace
-void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
+void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validation step
@@ -483,14 +483,14 @@ void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const IT
}
}
-Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
+Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
return Status{};
}
-void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -504,9 +504,9 @@ void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const W
(*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
}
-const char *CpuDirectConvolutionOutputStageKernel::name() const
+const char *CpuDirectConv2dOutputStageKernel::name() const
{
- return "CpuDirectConvolutionOutputStageKernel";
+ return "CpuDirectConv2dOutputStageKernel";
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index 9eeab194cb..62bc5d41c9 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h
+++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
#include "src/core/common/Macros.h"
@@ -41,33 +41,27 @@ namespace kernels
* @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
* of the @ref DirectConvolutionLayerOutputStageKernelInfo.
*/
-class CpuDirectConvolutionOutputStageKernel : public ICpuKernel
+class CpuDirectConv2dOutputStageKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuDirectConvolutionOutputStageKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionOutputStageKernel);
+ CpuDirectConv2dOutputStageKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel);
/** Set the accumulate buffer and the biases of the kernel.
*
- * @param[in, out] src Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+ * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place.
* Data type supported: F16/F32/S32
* @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
- * @param[out] dst (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+ * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
* Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
* Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
* @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
*/
void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionOutputStageKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
- * @param[in] dst (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+ * Similar to CpuDirectConv2dOutputStageKernel::configure()
*
* @return a status
*/
@@ -90,4 +84,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuPoolingKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp
index a55f60d7ad..e6f5890685 100644
--- a/src/core/cpu/kernels/CpuPoolingKernel.cpp
+++ b/src/core/cpu/kernels/CpuPool2dKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuPoolingKernel.h"
+#include "src/core/cpu/kernels/CpuPool2dKernel.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
@@ -374,12 +374,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
}
} // namespace
-BorderSize CpuPoolingKernel::border_size() const
+BorderSize CpuPool2dKernel::border_size() const
{
return _border_size;
}
-void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
@@ -420,7 +420,7 @@ void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Pooli
}
}
-Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -446,7 +446,7 @@ Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst
return Status{};
}
-void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -505,9 +505,9 @@ void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const
uk->ukernel(src, dst, indices, _pool_info, window_src, window);
}
-const char *CpuPoolingKernel::name() const
+const char *CpuPool2dKernel::name() const
{
- return "CpuPoolingKernel";
+ return "CpuPool2dKernel";
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuPoolingKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h
index 87d8f67119..95298004e9 100644
--- a/src/core/cpu/kernels/CpuPoolingKernel.h
+++ b/src/core/cpu/kernels/CpuPool2dKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_POOLING_KERNEL_H
-#define ARM_COMPUTE_CPU_POOLING_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
#include "arm_compute/core/Types.h"
#include "src/core/common/Macros.h"
@@ -35,12 +35,12 @@ namespace cpu
namespace kernels
{
/** Interface for the pooling layer kernel */
-class CpuPoolingKernel : public ICpuKernel
+class CpuPool2dKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuPoolingKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPoolingKernel);
+ CpuPool2dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel);
/** Configure kernel for a given list of arguments
*
* @note F16 are supported for pool sizes 2 and 3 only
@@ -51,14 +51,9 @@ public:
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPoolingKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @note F16 are supported for pool sizes 2 and 3 only
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32.
+ * Similar to CpuPool2dKernel::configure()
*
* @return a status
*/
@@ -80,4 +75,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOLING_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_POOL2D_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index ccf73883f0..c78ffb9848 100644
--- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
+#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -41,7 +41,7 @@ namespace kernels
{
using namespace arm_compute::misc::shape_calculator;
-void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -88,7 +88,7 @@ void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorI
INEKernel::configure(win);
}
-Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -136,7 +136,7 @@ Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const I
return Status{};
}
-void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -170,18 +170,18 @@ void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window
working_space, info.thread_id, info.num_threads);
}
-size_t CpuPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
+size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
{
return _kernel_asm->get_working_size(num_threads);
}
-bool CpuPoolingAssemblyWrapperKernel::is_configured() const
+bool CpuPool2dAssemblyWrapperKernel::is_configured() const
{
return _kernel_asm != nullptr;
}
template <typename Typesrc, typename Typedst>
-void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
{
const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
@@ -220,7 +220,7 @@ void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
}
template <typename Typesrc, typename Typedst>
-void CpuPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
{
const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index 34ec452deb..3afa4c16a4 100644
--- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
#include "arm_compute/core/Types.h"
#include "src/core/NEON/kernels/assembly/pooling.hpp"
@@ -41,23 +41,21 @@ namespace kernels
*
* Some kernels were written in assembly and highly optimised for specific
* CPUs like A53 or A55. The arm compute library creates an instance of
- * CpuPoolingAssemblyWrapperKernel and other auxiliary data structures to
+ * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to
* execute a single assembly kernel in the context of an NEFunction.
*
*/
-class CpuPoolingAssemblyWrapperKernel final : public ICpuKernel
+class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel
{
public:
/** Constructor
*/
- CpuPoolingAssemblyWrapperKernel() = default;
- CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &) = delete;
- CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &&) = default;
- CpuPoolingAssemblyWrapperKernel &operator=(CpuPoolingAssemblyWrapperKernel &) = delete;
+ CpuPool2dAssemblyWrapperKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel);
const char *name() const override
{
- return "CpuPoolingAssemblyWrapperKernel";
+ return "CpuPool2dAssemblyWrapperKernel";
}
/** Initialise the kernel's src and dst.
@@ -69,13 +67,11 @@ public:
*/
void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
- /** Indicates whether or not this function can be used to process the given parameters.
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor to store the result of pooling. Data types supported: same as @p src.
- * @param[in] info Pooling meta-data
+ * Similar to CpuPool2dAssemblyWrapperKernel::configure()
*
- * @return a status.
+ * @return a status
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
@@ -120,4 +116,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H */
+#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 0a5101f564..2c9a4f301b 100644
--- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h"
+#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -369,13 +369,13 @@ bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataL
} // namespace
-BorderSize ClDirectConvolutionKernel::border_size() const
+BorderSize ClDirectConv2dKernel::border_size() const
{
return _border_size;
}
-void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info)
+void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -564,8 +564,8 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex
_config_id += lower_string(string_from_data_layout(_data_layout));
}
-Status ClDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const GPUTarget target)
+Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const GPUTarget target)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first);
@@ -573,7 +573,7 @@ Status ClDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensor
return Status{};
}
-void ClDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
index 384b561003..ec76624e5c 100644
--- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H
-#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
#include "src/core/common/Macros.h"
#include "src/core/gpu/cl/ClCompileContext.h"
@@ -36,11 +36,11 @@ namespace kernels
{
/** Interface for the direct convolution kernel.
*/
-class ClDirectConvolutionKernel : public IClKernel
+class ClDirectConv2dKernel : public IClKernel
{
public:
- ClDirectConvolutionKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConvolutionKernel);
+ ClDirectConv2dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel);
/** Set the src, weights, biases and dst tensors info.
*
* @note: Due to set_valid_region(), thus src/weights/biases cannot be const. Need to change this once the set_valid_region() is removed.
@@ -64,19 +64,9 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolutionKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the src's volume 3rd dimension.
- * Data type supported:Same as @p src.
- * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM].
- * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] dst Output tensor info.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] target Target GPU architecture.
+ * Similar to ClDirectConv2dKernel::configure()
*
* @return a status
*/
@@ -94,4 +84,4 @@ public:
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H */
+#endif /*ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
index 08a3ce3784..0e15bffd14 100644
--- a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/gpu/cl/kernels/ClPoolingKernel.h"
+#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -202,17 +202,17 @@ std::tuple<Status, Window, ClPoolingConfig> validate_and_configure_window(ITenso
}
} // namespace
-ClPoolingKernel::ClPoolingKernel()
+ClPool2dKernel::ClPool2dKernel()
: _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
{
}
-BorderSize ClPoolingKernel::border_size() const
+BorderSize ClPool2dKernel::border_size() const
{
return _border_size;
}
-void ClPoolingKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -422,7 +422,7 @@ void ClPoolingKernel::configure(const ClCompileContext &compile_context, ITensor
ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
}
-Status ClPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info)));
@@ -430,7 +430,7 @@ Status ClPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst,
return Status{};
}
-void ClPoolingKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.h b/src/core/gpu/cl/kernels/ClPool2dKernel.h
index c1ce859e2c..8ecb8eb7b7 100644
--- a/src/core/gpu/cl/kernels/ClPoolingKernel.h
+++ b/src/core/gpu/cl/kernels/ClPool2dKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_POOLING_KERNEL_H
-#define ARM_COMPUTE_CL_POOLING_KERNEL_H
+#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CL_POOL2D_KERNEL_H
#include "src/core/common/Macros.h"
#include "src/core/gpu/cl/ClCompileContext.h"
@@ -35,12 +35,12 @@ namespace opencl
namespace kernels
{
/** Interface for the pooling layer kernel */
-class ClPoolingKernel : public IClKernel
+class ClPool2dKernel : public IClKernel
{
public:
/** Default constructor */
- ClPoolingKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPoolingKernel);
+ ClPool2dKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel);
/** Configure kernel for a given list of arguments
*
@@ -52,12 +52,9 @@ public:
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref ClPoolingKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32.
+ * Similar to ClPool2dKernel::configure()
*
* @return a status
*/
@@ -76,4 +73,4 @@ public:
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_POOLING_KERNEL_H */
+#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 74867ff64f..907e69d8d7 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -29,17 +29,17 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/runtime/gpu/cl/operators/ClActivation.h"
-#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h"
+#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
namespace arm_compute
{
struct CLDirectConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDirectConvolution> op{ nullptr };
+ const ICLTensor *src{ nullptr };
+ const ICLTensor *weights{ nullptr };
+ const ICLTensor *biases{ nullptr };
+ ICLTensor *dst{ nullptr };
+ std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr };
};
CLDirectConvolutionLayer::CLDirectConvolutionLayer()
@@ -65,14 +65,14 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
_impl->biases = biases;
_impl->dst = output;
- _impl->op = std::make_unique<opencl::ClDirectConvolution>();
+ _impl->op = std::make_unique<opencl::ClDirectConv2d>();
_impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
}
Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
- return opencl::ClDirectConvolution::validate(input, weights, biases, output, conv_info, act_info);
+ return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
}
void CLDirectConvolutionLayer::run()
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index fbaec1d2d9..7ba911c342 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -26,16 +26,16 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPooling.h"
+#include "src/runtime/gpu/cl/operators/ClPool2d.h"
namespace arm_compute
{
struct CLPoolingLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- ICLTensor *indices{ nullptr };
- std::unique_ptr<opencl::ClPooling> op{ nullptr };
+ const ICLTensor *src{ nullptr };
+ ICLTensor *dst{ nullptr };
+ ICLTensor *indices{ nullptr };
+ std::unique_ptr<opencl::ClPool2d> op{ nullptr };
};
CLPoolingLayer::CLPoolingLayer()
@@ -55,13 +55,13 @@ void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTenso
_impl->dst = output;
_impl->indices = indices;
- _impl->op = std::make_unique<opencl::ClPooling>();
+ _impl->op = std::make_unique<opencl::ClPool2d>();
_impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
}
Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
- return opencl::ClPooling::validate(input, output, pool_info, indices);
+ return opencl::ClPool2d::validate(input, output, pool_info, indices);
}
void CLPoolingLayer::run()
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index da9610ef42..a561b88058 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
@@ -47,15 +47,15 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
const ITensor *biases
{
nullptr
- }; // SRC_2
- Tensor permuted_input{}; // INT_0
- Tensor permuted_weights{}; // INT_1
- Tensor permuted_output{}; // INT_2
- Tensor workspace{}; // INT_3
- Tensor packed_weights{}; // INT_4
- std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
- bool is_prepared{ false };
- bool permute{ false };
+ }; // SRC_2
+ Tensor permuted_input{}; // INT_0
+ Tensor permuted_weights{}; // INT_1
+ Tensor permuted_output{}; // INT_2
+ Tensor workspace{}; // INT_3
+ Tensor packed_weights{}; // INT_4
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+ bool is_prepared{ false };
+ bool permute{ false };
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
@@ -80,7 +80,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
_impl->dst = output;
_impl->permute = is_nhwc;
- _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+ _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
_impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
_impl->dst->info(), info);
@@ -97,7 +97,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
}
info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
- auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConvolutionAssemblyDispatch>();
+ auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
if(is_nhwc)
{
@@ -154,7 +154,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+ return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
@@ -197,17 +197,17 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
{
- Tensor permuted_input{};
- Tensor permuted_weights{};
- Tensor permuted_output{};
- bool is_prepared{ false };
- bool is_nchw{ false };
- bool is_activationlayer_enabled{ false };
- const ITensor *weights{ nullptr };
- const ITensor *biases{ nullptr };
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+ Tensor permuted_input{};
+ Tensor permuted_weights{};
+ Tensor permuted_output{};
+ bool is_prepared{ false };
+ bool is_nchw{ false };
+ bool is_activationlayer_enabled{ false };
+ const ITensor *weights{ nullptr };
+ const ITensor *biases{ nullptr };
+ const ITensor *src{ nullptr };
+ ITensor *dst{ nullptr };
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
@@ -223,7 +223,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
output->info(), conv_info, depth_multiplier, act_info, dilation));
const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+ _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
_impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
_impl->src = input;
@@ -253,7 +253,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
output_to_use = &_impl->permuted_output;
}
- auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>();
+ auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
if(_impl->is_nchw)
@@ -273,7 +273,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+ return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
@@ -298,10 +298,10 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemory
#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
{
- DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
- NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
- NEDepthwiseConvolutionLayerGeneric func_generic{};
- std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+ DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
+ NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+ NEDepthwiseConvolutionLayerGeneric func_generic{};
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};
#endif // DOXYGEN_SKIP_THIS
@@ -309,7 +309,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
const ActivationLayerInfo &act_info, const Size2D &dilation)
{
const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- _impl->op = std::make_shared<cpu::CpuDepthwiseConvolution>();
+ _impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>();
_impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
info);
switch(_impl->depth_conv_func)
@@ -329,7 +329,7 @@ Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+ return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 73834381c6..58530e4a8f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,17 +27,17 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDirectConvolution.h"
+#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
namespace arm_compute
{
struct NEDirectConvolutionLayer::Impl
{
- ITensor *src{ nullptr };
- const ITensor *weights{ nullptr };
- const ITensor *bias{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuDirectConvolution> op{ nullptr };
+ ITensor *src{ nullptr };
+ const ITensor *weights{ nullptr };
+ const ITensor *bias{ nullptr };
+ ITensor *dst{ nullptr };
+ std::unique_ptr<cpu::CpuDirectConv2d> op{ nullptr };
};
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -52,14 +52,14 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
_impl->weights = weights;
_impl->bias = bias;
_impl->dst = output;
- _impl->op = std::make_unique<cpu::CpuDirectConvolution>(_memory_manager);
+ _impl->op = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
_impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
}
Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
- return cpu::CpuDirectConvolution::validate(input, weights, bias, output, conv_info, act_info);
+ return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
}
void NEDirectConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 1570cdeedc..bbf3e7cc4e 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -26,17 +26,17 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuPooling.h"
+#include "src/runtime/cpu/operators/CpuPool2d.h"
namespace arm_compute
{
struct NEPoolingLayer::Impl
{
- ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- ITensor *indices{ nullptr };
- Tensor workspace{ nullptr };
- std::unique_ptr<cpu::CpuPooling> op{ nullptr };
+ ITensor *src{ nullptr };
+ ITensor *dst{ nullptr };
+ ITensor *indices{ nullptr };
+ Tensor workspace{ nullptr };
+ std::unique_ptr<cpu::CpuPool2d> op{ nullptr };
};
NEPoolingLayer::~NEPoolingLayer() = default;
@@ -51,7 +51,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
_impl->src = input;
_impl->dst = output;
_impl->indices = indices;
- _impl->op = std::make_unique<cpu::CpuPooling>();
+ _impl->op = std::make_unique<cpu::CpuPool2d>();
_impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
// Allocate workspace based on kernel's memory requirements
@@ -66,7 +66,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
- return cpu::CpuPooling::validate(input, output, pool_info, indices);
+ return cpu::CpuPool2d::validate(input, output, pool_info, indices);
}
void NEPoolingLayer::run()
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
index 6d097280e0..160a9fd70b 100644
--- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -21,14 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
namespace arm_compute
{
@@ -36,61 +36,61 @@ namespace cpu
{
namespace
{
-Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
if(!is_data_type_quantized_per_channel(weights->data_type()))
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > input->dimension(idx_w) + info.pad_stride_info.pad_left() +
+ const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > input->dimension(idx_h) + info.pad_stride_info.pad_top() +
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
info.pad_stride_info.pad_bottom());
if(biases != nullptr)
{
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
//Validate Activation Layer
if(info.act_info.enabled())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
return Status{};
}
} // namespace
-CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::CpuDepthwiseConvolutionOptimizedInternal()
+CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::CpuDepthwiseConv2dOptimizedInternal()
: _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false),
_is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
-void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configure(ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- ITensorInfo *output,
- const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, (biases == nullptr) ? nullptr : biases,
- output, info));
+ ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
+ dst, info));
- _is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
_has_bias = biases != nullptr;
- _is_nchw = input->data_layout() == DataLayout::NCHW;
+ _is_nchw = src->data_layout() == DataLayout::NCHW;
_permute = _is_nchw;
_is_prepared = false;
@@ -105,7 +105,7 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configur
act_info_to_use = info.act_info;
}
- _dwc_optimized_func = std::make_unique<CpuDepthwiseConvolutionAssemblyDispatch>();
+ _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
if(_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
@@ -117,7 +117,7 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configur
auto output_perm = std::make_unique<TensorInfo>();
// Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
input_perm->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the weights tensor from IHW -> HWI
@@ -125,38 +125,38 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configur
weights_perm->set_data_layout(DataLayout::NHWC);
output_perm->set_data_layout(DataLayout::NHWC);
- output_perm->set_quantization_info(output->quantization_info());
+ output_perm->set_quantization_info(dst->quantization_info());
// Configure optimized depthwise
_dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
output_perm->set_data_layout(DataLayout::NHWC);
- _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U));
+ _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
}
else
{
- _dwc_optimized_func->configure(input, weights, biases, output, info);
+ _dwc_optimized_func->configure(src, weights, biases, dst, info);
}
// Configure activation
if(_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
- _activationlayer_function->configure(output, nullptr, info.act_info);
+ _activationlayer_function->configure(dst, nullptr, info.act_info);
}
}
-Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *output,
- const ConvolutionInfo &info)
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
- return validate_arguments_optimized(input, weights, biases, output, info);
+ return validate_arguments_optimized(src, weights, biases, dst, info);
}
-void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::run(ITensorPack &tensors)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
prepare(tensors);
@@ -229,7 +229,7 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::run(ITen
}
}
-void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::prepare(ITensorPack &tensors)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors)
{
if(!_is_prepared)
{
@@ -272,35 +272,35 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::prepare(
}
}
-CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::CpuDepthwiseConvolutionGeneric()
+CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::CpuDepthwiseConv2dGeneric()
: _depthwise_conv_kernel(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false),
_is_activationlayer_enabled(false)
{
}
-void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolution::validate(input, weights, (biases == nullptr) ? nullptr : biases,
- output, info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
+ dst, info));
- _is_nchw = input->data_layout() == DataLayout::NCHW;
+ _is_nchw = src->data_layout() == DataLayout::NCHW;
_is_prepared = !_is_nchw;
- ITensorInfo *input_to_use = input;
+ ITensorInfo *input_to_use = src;
const ITensorInfo *weights_to_use = weights;
- ITensorInfo *output_to_use = output;
+ ITensorInfo *output_to_use = dst;
auto input_perm = std::make_unique<TensorInfo>();
auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
if(_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
_permute_weights = std::make_unique<cpu::CpuPermute>();
- _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
input_perm->set_data_layout(DataLayout::NHWC);
input_to_use = input_perm.get();
@@ -311,13 +311,13 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorI
output_to_use = output_perm.get();
}
- _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>();
+ _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
_depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
if(_is_nchw)
{
_permute_output = std::make_unique<cpu::CpuPermute>();
- _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U));
+ _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
output_perm->set_data_layout(DataLayout::NHWC);
}
@@ -326,48 +326,48 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorI
if(_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
- _activationlayer_function->configure(output, nullptr, info.act_info);
+ _activationlayer_function->configure(dst, nullptr, info.act_info);
}
}
-Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ConvolutionInfo &info)
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- if(input->data_layout() == DataLayout::NCHW)
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ if(src->data_layout() == DataLayout::NCHW)
{
- TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_input_shape = src->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+ const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(input, weights, biases, output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
}
// Validate Activation Layer
if(info.act_info.enabled())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
return Status{};
}
-void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::run(ITensorPack &tensors)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
{
auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
@@ -421,7 +421,7 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::run(ITensorPack &t
}
}
-void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::prepare(ITensorPack &tensors)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
{
if(!_is_prepared)
{
@@ -440,47 +440,47 @@ void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::prepare(ITensorPac
}
}
-CpuDepthwiseConvolution::CpuDepthwiseConvolution()
+CpuDepthwiseConv2d::CpuDepthwiseConv2d()
: _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic()
{
}
-void CpuDepthwiseConvolution::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
{
- _depth_conv_func = get_depthwiseconvolution_function(input, weights, (biases != nullptr) ? biases : nullptr, output, info);
+ _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
switch(_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.configure(input, weights, biases, output, info);
+ _func_optimized.configure(src, weights, biases, dst, info);
break;
case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.configure(input, weights, biases, output, info);
+ _func_generic.configure(src, weights, biases, dst, info);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
-Status CpuDepthwiseConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
{
- DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, info);
+ DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
switch(depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- return CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info);
+ return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
break;
case DepthwiseConvolutionFunction::GENERIC:
- return CpuDepthwiseConvolutionGeneric::validate(input, weights, biases, output, info);
+ return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
-DepthwiseConvolutionFunction CpuDepthwiseConvolution::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ConvolutionInfo &info)
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
- if(bool(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info)))
+ if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
{
return DepthwiseConvolutionFunction::OPTIMIZED;
}
@@ -490,7 +490,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConvolution::get_depthwiseconvolution_f
}
}
-void CpuDepthwiseConvolution::run(ITensorPack &tensors)
+void CpuDepthwiseConv2d::run(ITensorPack &tensors)
{
switch(_depth_conv_func)
{
@@ -505,7 +505,7 @@ void CpuDepthwiseConvolution::run(ITensorPack &tensors)
}
}
-void CpuDepthwiseConvolution::prepare(ITensorPack &tensors)
+void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
{
switch(_depth_conv_func)
{
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
new file mode 100644
index 0000000000..049397fe60
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConv2d : public ICpuOperator
+{
+public:
+ /** Default constructor */
+ CpuDepthwiseConv2d();
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in]      weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+ *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+ * @param[in]      info    Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+ /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] dst Destination tensor. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a Depthwise Convolution Function
+ */
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+private:
+ /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+ *
+ * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
+ *
+ * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
+ * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present
+ * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required
+ * -# @ref NEActivationLayer if fused activation is required
+ *
+ */
+ class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConv2dOptimizedInternal();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConv2dOptimizedInternal() = default;
+ /** Initialize the function's source, destination, kernels and border_size.
+ *
+ * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_input{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_output{ nullptr };
+ std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
+ bool _has_bias{ false };
+ bool _is_quantized{ false };
+ bool _is_nchw{ true };
+ bool _permute{ false };
+ bool _is_activationlayer_enabled{ false };
+ bool _is_prepared{ false };
+ };
+
+ /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
+ *
+ * -# @ref CpuDepthwiseConv2dNativeKernel
+ *
+ */
+ class CpuDepthwiseConv2dGeneric : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConv2dGeneric();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConv2dGeneric() = default;
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+ * @param[in]      info    Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dGeneric::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_input{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_output{ nullptr };
+ std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
+ bool _is_nchw{ true };
+ bool _is_prepared{ false };
+ bool _is_activationlayer_enabled{ false };
+ };
+
+ DepthwiseConvolutionFunction _depth_conv_func;
+ CpuDepthwiseConv2dOptimizedInternal _func_optimized;
+ CpuDepthwiseConv2dGeneric _func_generic;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2D_H */
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index 039714abb1..a36ee1d45b 100644
--- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Utils.h"
@@ -211,13 +211,13 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_
}
}
-std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo *input,
+std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo *src,
const ITensorInfo *weights,
ITensorInfo *output,
const ConvolutionInfo &info)
{
- const DataType data_type = input->data_type();
- const TensorShape shape = input->tensor_shape();
+ const DataType data_type = src->data_type();
+ const TensorShape shape = src->tensor_shape();
const int n_batches = shape[3];
const int in_rows = shape.z();
@@ -249,7 +249,7 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
// Create quantized convolver
if(is_uniform_quantized)
{
- const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform();
+ const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform();
const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform();
const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
@@ -273,7 +273,7 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
}
else if(is_perchannel_quantized)
{
- const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform();
+ const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform();
const QuantizationInfo weights_qinfo = weights->quantization_info();
const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
@@ -327,7 +327,7 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
}
} // namespace
-struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl
+struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
{
std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr };
NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{};
@@ -336,36 +336,36 @@ struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl
};
#ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch()
+CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
: _pImpl(std::make_unique<LocalImpl>())
{
}
#endif /* DOXYGEN_SKIP_THIS */
-CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch() = default;
+CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default;
-void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *bias,
- ITensorInfo *output,
- const ConvolutionInfo &info)
+void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_UNUSED(bias);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionAssemblyDispatch::validate(input,
- weights,
- bias != nullptr ? bias : nullptr,
- output,
- info));
+ ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src,
+ weights,
+ bias != nullptr ? bias : nullptr,
+ dst,
+ info));
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
+ const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info()));
_pImpl->is_prepared = false;
// Create convolver
- _pImpl->dwc_assembly_kernel = create_convolver(input, weights, output, info);
+ _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info);
ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr);
// Create assembly kernel wrapper
@@ -386,27 +386,27 @@ void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo *i
_pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment });
}
-experimental::MemoryRequirements CpuDepthwiseConvolutionAssemblyDispatch::workspace() const
+experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const
{
return _pImpl->mem_req;
}
-Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *bias,
- const ITensorInfo *output,
- const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
// Validate convolver
- ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, info));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info));
// Validate activation
const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info);
@@ -416,50 +416,50 @@ Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *
// Check bias
if(bias != nullptr)
{
- unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
}
// Check output
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
// The uniform quantization case will only have 1 scale value in the weights quantization info
- const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform();
+ const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform();
const QuantizationInfo weights_qinfo = weights->quantization_info();
- const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform();
for(auto const s : weights_qinfo.scale())
{
- const float fmultipler = input_qinfo.scale * s / output_qinfo.scale;
+ const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale;
ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
}
return Status{};
}
-bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ConvolutionInfo &info)
+bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ConvolutionInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
// Reshape input shape if in NHWC format
- const DataLayout data_layout = input->data_layout();
- TensorShape in_shape{ input->tensor_shape() };
+ const DataLayout data_layout = src->data_layout();
+ TensorShape in_shape{ src->tensor_shape() };
if(data_layout == DataLayout::NHWC)
{
- in_shape.set(Window::DimX, input->tensor_shape().y());
- in_shape.set(Window::DimY, input->tensor_shape().z());
- in_shape.set(Window::DimZ, input->tensor_shape().x());
+ in_shape.set(Window::DimX, src->tensor_shape().y());
+ in_shape.set(Window::DimY, src->tensor_shape().z());
+ in_shape.set(Window::DimZ, src->tensor_shape().x());
}
// Check data type
- const DataType input_type = input->data_type();
+ const DataType input_type = src->data_type();
const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8;
const DataType weights_type = weights->data_type();
const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED
@@ -497,7 +497,7 @@ bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITens
return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported;
}
-void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors)
+void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
{
// Prepare assembly kernel
prepare(tensors);
@@ -530,7 +530,7 @@ void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors)
NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX);
}
-void CpuDepthwiseConvolutionAssemblyDispatch::prepare(ITensorPack &tensors)
+void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
{
if(!_pImpl->is_prepared)
{
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
index 6aac74c3ef..195942b7fd 100644
--- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -21,9 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H
+#include "src/core/common/Macros.h"
#include "src/runtime/cpu/ICpuOperator.h"
namespace arm_compute
@@ -31,57 +32,45 @@ namespace arm_compute
namespace cpu
{
/** Depthwise convolution assembly kernel glue */
-class CpuDepthwiseConvolutionAssemblyDispatch : public ICpuOperator
+class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator
{
public:
- CpuDepthwiseConvolutionAssemblyDispatch();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConvolutionAssemblyDispatch(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete;
- /** Default move constructor */
- CpuDepthwiseConvolutionAssemblyDispatch(CpuDepthwiseConvolutionAssemblyDispatch &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConvolutionAssemblyDispatch &operator=(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete;
- /** Default move assignment operator */
- CpuDepthwiseConvolutionAssemblyDispatch &operator=(CpuDepthwiseConvolutionAssemblyDispatch &&) = default;
+ /** Default constructor */
+ CpuDepthwiseConv2dAssemblyDispatch();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch);
/** Default destructor */
- ~CpuDepthwiseConvolutionAssemblyDispatch();
+ ~CpuDepthwiseConv2dAssemblyDispatch();
+
/** Initialize the function's source, destination, kernels and border_size.
*
* @note Supports only NHWC format
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src.
* @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor info. Data type supported: same as @p input.
+ * Data type supported: Same as @p src.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionAssemblyDispatch
- *
- * @note Supports only NHWC format
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] info Depthwise convolution meta-data.
+ * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
*
- * @return An error status
+ * @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
/** Check if the optimized kernel can be used for the given kernel sizes and strides
*
* @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
*
- * @param[in] input Input tensor info.
+ * @param[in] src Input tensor info.
* @param[in] weights Weights tensor info.
* @param[in] info Depthwise convolution meta-data.
*
* @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
*/
- static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, const ConvolutionInfo &info);
+ static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -94,4 +83,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h
deleted file mode 100644
index e39cb7db4d..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEQUANTIZATION_H
-#define ARM_COMPUTE_CPU_DEQUANTIZATION_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Function to execute a depthwise convolution.
- */
-class CpuDepthwiseConvolution : public ICpuOperator
-{
-public:
- /** Default constructor */
- CpuDepthwiseConvolution();
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] weights Weights tensor info. These are 3D tensors info with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
-
- /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConvolution
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] info Depthwise convolution meta-data.
- *
- * @return a Depthwise Convolution Function
- */
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ConvolutionInfo &info);
-
-   // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
-private:
- /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
- *
- * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
- *
- * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
- * -# @ref CpuDepthwiseConvolution3x3Kernel if 3x3 and no assembly kernel implementation is present
- * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
- * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
- * -# @ref NEActivationLayer if fused activation is required
- *
- */
- class CpuDepthwiseConvolutionOptimizedInternal : public ICpuOperator
- {
- public:
- /** Default constructor */
- CpuDepthwiseConvolutionOptimizedInternal();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConvolutionOptimizedInternal(const CpuDepthwiseConvolutionOptimizedInternal &) = delete;
- /** Default move constructor */
- CpuDepthwiseConvolutionOptimizedInternal(CpuDepthwiseConvolutionOptimizedInternal &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConvolutionOptimizedInternal &operator=(const CpuDepthwiseConvolutionOptimizedInternal &) = delete;
- /** Default move assignment operator */
- CpuDepthwiseConvolutionOptimizedInternal &operator=(CpuDepthwiseConvolutionOptimizedInternal &&) = default;
- /** Default destructor */
- ~CpuDepthwiseConvolutionOptimizedInternal() = default;
- /** Initialize the function's source, destination, kernels and border_size.
- *
- * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution3x3
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] info Depthwise convolution meta-data.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
-
-   // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
- private:
- std::unique_ptr<CpuDepthwiseConvolutionAssemblyDispatch> _dwc_optimized_func{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _has_bias{ false };
- bool _is_quantized{ false };
- bool _is_nchw{ true };
- bool _permute{ false };
- bool _is_activationlayer_enabled{ false };
- bool _is_prepared{ false };
- };
-
- /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
- *
- * -# @ref CpuDepthwiseConvolutionNativeKernel
- *
- */
- class CpuDepthwiseConvolutionGeneric : public ICpuOperator
- {
- public:
- /** Default constructor */
- CpuDepthwiseConvolutionGeneric();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConvolutionGeneric(const CpuDepthwiseConvolutionGeneric &) = delete;
- /** Default move constructor */
- CpuDepthwiseConvolutionGeneric(CpuDepthwiseConvolutionGeneric &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConvolutionGeneric &operator=(const CpuDepthwiseConvolutionGeneric &) = delete;
- /** Default move assignment operator */
- CpuDepthwiseConvolutionGeneric &operator=(CpuDepthwiseConvolutionGeneric &&) = default;
- /** Default destructor */
- ~CpuDepthwiseConvolutionGeneric() = default;
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[out] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionGeneric
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[in] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
- private:
- std::unique_ptr<kernels::CpuDepthwiseConvolutionNativeKernel> _depthwise_conv_kernel{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _is_nchw{ true };
- bool _is_prepared{ false };
- bool _is_activationlayer_enabled{ false };
- };
-
- DepthwiseConvolutionFunction _depth_conv_func;
- CpuDepthwiseConvolutionOptimizedInternal _func_optimized;
- CpuDepthwiseConvolutionGeneric _func_generic;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEQUANTIZATION_H */
diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp
index 33f79603e8..8812b777a3 100644
--- a/src/runtime/cpu/operators/CpuDirectConvolution.cpp
+++ b/src/runtime/cpu/operators/CpuDirectConv2d.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/runtime/cpu/operators/CpuDirectConvolution.h"
+#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
@@ -32,19 +32,19 @@ namespace arm_compute
{
namespace cpu
{
-CpuDirectConvolution::~CpuDirectConvolution() = default;
+CpuDirectConv2d::~CpuDirectConv2d() = default;
-CpuDirectConvolution::CpuDirectConvolution(std::shared_ptr<IMemoryManager> memory_manager)
+CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
_is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
{
}
-void CpuDirectConvolution::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- _output_stage_kernel = std::make_unique<kernels::CpuDirectConvolutionOutputStageKernel>();
- _conv_kernel = std::make_unique<kernels::CpuDirectConvolutionKernel>();
+ _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>();
+ _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>();
_input_border_handler = std::make_unique<NEFillBorderKernel>();
// Free accumulator
@@ -80,8 +80,8 @@ void CpuDirectConvolution::configure(ITensorInfo *src, ITensorInfo *weights, con
}
}
-Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info)
+Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -90,7 +90,7 @@ Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo
TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
// Validate Convolution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionKernel::validate(src, weights, &accumulator, conv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
if(bias != nullptr)
{
@@ -101,7 +101,7 @@ Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo
}
// Validate bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionOutputStageKernel::validate(&accumulator, bias, dst));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
if(act_info.enabled())
{
@@ -111,7 +111,7 @@ Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo
return Status{};
}
-void CpuDirectConvolution::run(ITensorPack &tensors)
+void CpuDirectConv2d::run(ITensorPack &tensors)
{
MemoryGroupResourceScope scope_mg(_memory_group);
diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.h b/src/runtime/cpu/operators/CpuDirectConv2d.h
index 0635e087fd..9e584b9c49 100644
--- a/src/runtime/cpu/operators/CpuDirectConvolution.h
+++ b/src/runtime/cpu/operators/CpuDirectConv2d.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H
-#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
@@ -33,8 +33,8 @@
#include "arm_compute/runtime/Tensor.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/cpu/ICpuKernel.h"
-#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"
-#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
#include "src/runtime/cpu/ICpuOperator.h"
#include "src/runtime/cpu/operators/CpuActivation.h"
@@ -49,16 +49,16 @@ namespace cpu
* This function calls the following kernels:
*
* -# @ref NEFillBorderKernel for the input
- * -# @ref kernels::CpuDirectConvolutionOutputStageKernel
- * -# @ref kernels::CpuDirectConvolutionKernel
+ * -# @ref kernels::CpuDirectConv2dOutputStageKernel
+ * -# @ref kernels::CpuDirectConv2dKernel
*/
-class CpuDirectConvolution : public ICpuOperator
+class CpuDirectConv2d : public ICpuOperator
{
public:
/** Constructor */
- CpuDirectConvolution(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Destructor */
- ~CpuDirectConvolution();
+ ~CpuDirectConv2d();
/** Set the input, weights, biases and output tensors.
*
* @note: DirectConvolution only works in the following configurations:
@@ -78,23 +78,9 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer
+ /** Static function to check if given info will lead to a valid configuration
*
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
- * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
- * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32
- *
- * @param[in] src Input tensor info. Data types supported: F16/F32.
- * @param[in] weights Set of kernels to convolve the input volume.
- * Supported sizes: 1x1, 3x3 and 5x5.
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p src.
- * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src.
- * @param[in] dst Output tensor info.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * Similar to CpuDirectConv2d::configure()
*
* @return a status
*/
@@ -105,17 +91,17 @@ public:
void run(ITensorPack &tensors) override;
private:
- MemoryGroup _memory_group;
- std::unique_ptr<kernels::CpuDirectConvolutionOutputStageKernel> _output_stage_kernel;
- std::unique_ptr<kernels::CpuDirectConvolutionKernel> _conv_kernel;
- std::unique_ptr<NEFillBorderKernel> _input_border_handler;
- std::unique_ptr<CpuActivation> _activationlayer_function;
- Tensor _accumulator;
- bool _has_bias{ false };
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
- bool _is_padding_required{ false };
+ MemoryGroup _memory_group;
+ std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel;
+ std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel;
+ std::unique_ptr<NEFillBorderKernel> _input_border_handler;
+ std::unique_ptr<CpuActivation> _activationlayer_function;
+ Tensor _accumulator;
+ bool _has_bias{ false };
+ bool _is_activationlayer_enabled{ false };
+ unsigned int _dim_split{ 0 };
+ bool _is_padding_required{ false };
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H */
+#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */
diff --git a/src/runtime/cpu/operators/CpuPooling.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp
index 3a6ac24a74..b225199c40 100644
--- a/src/runtime/cpu/operators/CpuPooling.cpp
+++ b/src/runtime/cpu/operators/CpuPool2d.cpp
@@ -21,20 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/runtime/cpu/operators/CpuPooling.h"
+#include "src/runtime/cpu/operators/CpuPool2d.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
-#include "src/core/cpu/kernels/CpuPoolingKernel.h"
+#include "src/core/cpu/kernels/CpuPool2dKernel.h"
+#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
namespace arm_compute
{
namespace cpu
{
-CpuPooling::CpuPooling()
+CpuPool2d::CpuPool2d()
: _pooling_layer_kernel(),
_border_handler(),
_asm_glue(),
@@ -44,12 +44,12 @@ CpuPooling::CpuPooling()
{
}
-CpuPooling::~CpuPooling() = default;
+CpuPool2d::~CpuPool2d() = default;
-void CpuPooling::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
{
// Check if we can run assembly kernels. Currently, indices are not supported by those kernels
- const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
// Get data layout
_data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
@@ -64,7 +64,7 @@ void CpuPooling::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLaye
const CPUInfo &ci = NEScheduler::get().cpu_info();
const unsigned int num_threads = NEScheduler::get().num_threads();
- auto pooling_wrapper = std::make_unique<kernels::CpuPoolingAssemblyWrapperKernel>();
+ auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
pooling_wrapper->configure(src, dst, pool_info, ci);
@@ -78,7 +78,7 @@ void CpuPooling::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLaye
else
{
// Configure pooling kernel
- auto k = std::make_unique<kernels::CpuPoolingKernel>();
+ auto k = std::make_unique<kernels::CpuPool2dKernel>();
k->configure(src, dst, pool_info, indices);
_pooling_layer_kernel = std::move(k);
@@ -106,19 +106,19 @@ void CpuPooling::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLaye
}
}
-Status CpuPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
- const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
if(run_optimised)
{
return Status{};
}
- return kernels::CpuPoolingKernel::validate(src, dst, pool_info, indices);
+ return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices);
}
-void CpuPooling::run(ITensorPack &tensors)
+void CpuPool2d::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
@@ -148,7 +148,7 @@ void CpuPooling::run(ITensorPack &tensors)
}
}
-experimental::MemoryRequirements CpuPooling::workspace() const
+experimental::MemoryRequirements CpuPool2d::workspace() const
{
return _mem_req;
}
diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPool2d.h
index bc30adf762..ae3d115dfc 100644
--- a/src/runtime/cpu/operators/CpuPooling.h
+++ b/src/runtime/cpu/operators/CpuPool2d.h
@@ -21,12 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_POOLING_H
-#define ARM_COMPUTE_CPU_POOLING_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
+#ifndef ARM_COMPUTE_CPU_POOL2D_H
+#define ARM_COMPUTE_CPU_POOL2D_H
#include "arm_compute/core/experimental/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/runtime/cpu/ICpuOperator.h"
#include <memory>
@@ -40,24 +40,17 @@ namespace cpu
/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
*
* -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::CpuPoolingKernel
- * -# @ref kernels::CpuPoolingAssemblyWrapperKernel
+ * -# @ref kernels::CpuPool2dKernel
+ * -# @ref kernels::CpuPool2dAssemblyWrapperKernel
*/
-class CpuPooling : public ICpuOperator
+class CpuPool2d : public ICpuOperator
{
public:
/** Constructor */
- CpuPooling();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuPooling(const CpuPooling &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuPooling &operator=(const CpuPooling &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- CpuPooling(CpuPooling &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- CpuPooling &operator=(CpuPooling &&) = delete;
+ CpuPool2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d);
/** Default destructor */
- ~CpuPooling();
+ ~CpuPool2d();
/** Set the src and dst tensors.
*
* @note F16 is supported for pool sizes 2 and 3 only
@@ -68,14 +61,9 @@ public:
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPooling
- *
- * @note F16 is supported for pool sizes 2 and 3 only
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (optional) Tensor info of the indices of the maximal values. Data type supported: U32.
+ * Similar to CpuPool2d::configure()
*
* @return a status
*/
@@ -96,4 +84,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOLING_H */
+#endif /* ARM_COMPUTE_CPU_POOL2D_H */
diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
index 3382a6c3c5..527b3a65f9 100644
--- a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp
+++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
@@ -21,13 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h"
+#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/gpu/cl/ClCompileContext.h"
#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h"
+#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
namespace arm_compute
{
@@ -44,11 +44,11 @@ ITensorPack select_activation_src_dst(ITensorPack &tensors)
}
} // namespace
-void ClDirectConvolution::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
// Configure direct convolution kernel
- auto k = std::make_unique<kernels::ClDirectConvolutionKernel>();
+ auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
k->set_target(CLScheduler::get().target());
k->configure(compile_context, src, weights, biases, dst, conv_info);
_direct_conv_kernel = std::move(k);
@@ -74,10 +74,10 @@ void ClDirectConvolution::configure(const CLCompileContext &compile_context, ITe
CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
}
-Status ClDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConvolutionKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target()));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target()));
if(act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
@@ -85,7 +85,7 @@ Status ClDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *
return Status{};
}
-void ClDirectConvolution::run(ITensorPack &tensors)
+void ClDirectConv2d::run(ITensorPack &tensors)
{
// Run border handler
CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.h b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
index e7ad927b0b..e069733fab 100644
--- a/src/runtime/gpu/cl/operators/ClDirectConvolution.h
+++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H
-#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
#include "src/core/gpu/cl/ClCompileContext.h"
#include "src/core/gpu/cl/IClKernel.h"
@@ -37,13 +37,13 @@ namespace opencl
/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels:
*
* -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClDirectConvolution
+ * -# @ref opencl::ClDirectConv2d
*/
-class ClDirectConvolution : public IClOperator
+class ClDirectConv2d : public IClOperator
{
public:
/** Constructor */
- ClDirectConvolution() = default;
+ ClDirectConv2d() = default;
/** Set the src and dst tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -61,18 +61,9 @@ public:
*/
void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolution
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of srcs.
- * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
- * Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * Similar to ClDirectConv2d::configure()
*
* @return a status
*/
@@ -89,4 +80,4 @@ private:
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */ \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPooling.cpp b/src/runtime/gpu/cl/operators/ClPool2d.cpp
index 8610eb9842..40c2b0a8ba 100644
--- a/src/runtime/gpu/cl/operators/ClPooling.cpp
+++ b/src/runtime/gpu/cl/operators/ClPool2d.cpp
@@ -21,23 +21,23 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/runtime/gpu/cl/operators/ClPooling.h"
+#include "src/runtime/gpu/cl/operators/ClPool2d.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClPoolingKernel.h"
+#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
namespace arm_compute
{
namespace opencl
{
-void ClPooling::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
+void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
// Configure pooling kernel
- auto k = std::make_unique<kernels::ClPoolingKernel>();
+ auto k = std::make_unique<kernels::ClPool2dKernel>();
k->set_target(CLScheduler::get().target());
k->configure(compile_context, src, dst, info, indices);
_pooling = std::move(k);
@@ -85,12 +85,12 @@ void ClPooling::configure(const ClCompileContext &compile_context, ITensorInfo *
CLScheduler::get().tune_kernel_static(*_pooling);
}
-Status ClPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
+Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
{
- return kernels::ClPoolingKernel::validate(src, dst, info, indices);
+ return kernels::ClPool2dKernel::validate(src, dst, info, indices);
}
-void ClPooling::run(ITensorPack &tensors)
+void ClPool2d::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
diff --git a/src/runtime/gpu/cl/operators/ClPooling.h b/src/runtime/gpu/cl/operators/ClPool2d.h
index 99de6d0dcf..8ac386a64b 100644
--- a/src/runtime/gpu/cl/operators/ClPooling.h
+++ b/src/runtime/gpu/cl/operators/ClPool2d.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_POOLING_H
-#define ARM_COMPUTE_CL_POOLING_H
+#ifndef ARM_COMPUTE_CL_POOL2D_H
+#define ARM_COMPUTE_CL_POOL2D_H
#include "src/core/gpu/cl/ClCompileContext.h"
#include "src/runtime/gpu/cl/IClOperator.h"
@@ -36,13 +36,13 @@ namespace opencl
/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
*
* -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClPooling
+ * -# @ref opencl::ClPool2d
*/
-class ClPooling : public IClOperator
+class ClPool2d : public IClOperator
{
public:
/** Constructor */
- ClPooling() = default;
+ ClPool2d() = default;
/** Configure operator for a given list of arguments
*
* @param[in] compile_context The compile context to be used.
@@ -52,12 +52,9 @@ public:
* @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
*/
void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref ClPooling
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] info Pooling layer parameters.
- * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
+ * Similar to ClPool2d::configure()
*
* @return a status
*/
@@ -72,4 +69,4 @@ private:
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_POOLING_H */
+#endif /* ARM_COMPUTE_CL_POOL2D_H */
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
index 5c2ebaa51f..ddf3faacb6 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
#include "tests/NEON/Accessor.h"
#include "tests/NEON/Helper.h"
#include "tests/framework/Macros.h"
@@ -38,7 +38,7 @@ namespace validation
using namespace arm_compute::misc::shape_calculator;
// Create function for CpuDepthwiseConvolutionKernel
-using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>;
+using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConv2dNativeKernel>;
// Fixture for NEDepthwiseConvolutionLayerKernel
template <typename T>
@@ -124,7 +124,7 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
auto biases = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout);
auto dst = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout);
- cpu::kernels::CpuDepthwiseConvolutionNativeKernel dwc;
+ cpu::kernels::CpuDepthwiseConv2dNativeKernel dwc;
const ConvolutionInfo info{pad_stride_info, 1, ActivationLayerInfo(), Size2D(1, 1)};
dwc.configure(src.info(), weights.info(), biases.info(), dst.info(), info);