From b4bb6a03f717a320b935809fde795b3d6ec5a69f Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 24 May 2021 16:01:32 +0100
Subject: Rename ported functions

Rename CpuPooling to CpuPool2d
Rename CpuPoolingKernel to CpuPool2dKernel
Rename CpuPoolingAssemblyWrapperKernel to CpuPool2dAssemblyWrapperKernel
Move CpuPool2dAssemblyWrapperKernel into the internal subfolder
Rename CpuDepthwiseConvolutionNativeKernel to CpuDepthwiseConv2dNativeKernel
Rename CpuDepthwiseConvolutionAssemblyDispatch to CpuDepthwiseConv2dAssemblyDispatch
Rename CpuDepthwiseConvolution to CpuDepthwiseConv2d
Rename CpuDirectConvolutionKernel to CpuDirectConv2dKernel
Rename CpuDirectConvolutionOutputStageKernel to CpuDirectConv2dOutputStageKernel
Rename CpuDirectConvolution to CpuDirectConv2d
Rename ClPoolingKernel to ClPool2dKernel
Rename ClPooling to ClPool2d
Rename ClDirectConvolutionKernel to ClDirectConv2dKernel

Resolves: COMPMID-4405
Change-Id: I8e48f015e4e492a76a7512f5679cb3eb0cd028f6
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5708
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
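For code that instantiated these internal, experimental-API operators directly, the renames are purely mechanical. As an illustrative sketch (the surrounding tensor-info setup is hypothetical; class and method names are those used in this patch):

    // Before this patch:
    //   auto op = std::make_unique<cpu::CpuPooling>();
    // After this patch:
    auto op = std::make_unique<cpu::CpuPool2d>();
    op->configure(src_info, dst_info, pool_info, /* indices */ nullptr);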
---
 .../CL/functions/CLDirectConvolutionLayer.cpp      |  16 +-
 src/runtime/CL/functions/CLPoolingLayer.cpp        |  14 +-
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp |  66 +--
 .../NEON/functions/NEDirectConvolutionLayer.cpp    |  16 +-
 src/runtime/NEON/functions/NEPoolingLayer.cpp      |  16 +-
 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp   | 523 +++++++++++++++++++
 src/runtime/cpu/operators/CpuDepthwiseConv2d.h     | 213 ++++++++
 .../CpuDepthwiseConv2dAssemblyDispatch.cpp         | 563 +++++++++++++++++++++
 .../operators/CpuDepthwiseConv2dAssemblyDispatch.h |  86 ++++
 .../cpu/operators/CpuDepthwiseConvolution.cpp      | 523 -------------------
 .../cpu/operators/CpuDepthwiseConvolution.h        | 230 ---------
 .../CpuDepthwiseConvolutionAssemblyDispatch.cpp    | 563 ---------------------
 .../CpuDepthwiseConvolutionAssemblyDispatch.h      |  97 ----
 src/runtime/cpu/operators/CpuDirectConv2d.cpp      | 147 ++++++
 src/runtime/cpu/operators/CpuDirectConv2d.h        | 107 ++++
 src/runtime/cpu/operators/CpuDirectConvolution.cpp | 147 ------
 src/runtime/cpu/operators/CpuDirectConvolution.h   | 121 -----
 src/runtime/cpu/operators/CpuPool2d.cpp            | 156 ++++++
 src/runtime/cpu/operators/CpuPool2d.h              |  87 ++++
 src/runtime/cpu/operators/CpuPooling.cpp           | 156 ------
 src/runtime/cpu/operators/CpuPooling.h             |  99 ----
 src/runtime/gpu/cl/operators/ClDirectConv2d.cpp    | 102 ++++
 src/runtime/gpu/cl/operators/ClDirectConv2d.h      |  83 +++
 .../gpu/cl/operators/ClDirectConvolution.cpp       | 102 ----
 src/runtime/gpu/cl/operators/ClDirectConvolution.h |  92 ----
 src/runtime/gpu/cl/operators/ClPool2d.cpp          | 101 ++++
 src/runtime/gpu/cl/operators/ClPool2d.h            |  72 +++
 src/runtime/gpu/cl/operators/ClPooling.cpp         | 101 ----
 src/runtime/gpu/cl/operators/ClPooling.h           |  75 ---
 29 files changed, 2304 insertions(+), 2370 deletions(-)
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.h
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.h
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h
 create mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.cpp
 create mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuDirectConvolution.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDirectConvolution.h
 create mode 100644 src/runtime/cpu/operators/CpuPool2d.cpp
 create mode 100644 src/runtime/cpu/operators/CpuPool2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuPooling.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuPooling.h
 create mode 100644 src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
 create mode 100644 src/runtime/gpu/cl/operators/ClDirectConv2d.h
 delete mode 100644 src/runtime/gpu/cl/operators/ClDirectConvolution.cpp
 delete mode 100644 src/runtime/gpu/cl/operators/ClDirectConvolution.h
 create mode 100644 src/runtime/gpu/cl/operators/ClPool2d.cpp
 create mode 100644 src/runtime/gpu/cl/operators/ClPool2d.h
 delete mode 100644 src/runtime/gpu/cl/operators/ClPooling.cpp
 delete mode 100644 src/runtime/gpu/cl/operators/ClPooling.h

diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 74867ff64f..907e69d8d7 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -29,17 +29,17 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/runtime/gpu/cl/operators/ClActivation.h"
-#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h"
+#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
 
 namespace arm_compute
 {
 struct CLDirectConvolutionLayer::Impl
 {
-    const ICLTensor                              *src{ nullptr };
-    const ICLTensor                              *weights{ nullptr };
-    const ICLTensor                              *biases{ nullptr };
-    ICLTensor                                    *dst{ nullptr };
-    std::unique_ptr<opencl::ClDirectConvolution>  op{ nullptr };
+    const ICLTensor                         *src{ nullptr };
+    const ICLTensor                         *weights{ nullptr };
+    const ICLTensor                         *biases{ nullptr };
+    ICLTensor                               *dst{ nullptr };
+    std::unique_ptr<opencl::ClDirectConv2d>  op{ nullptr };
 };
 
 CLDirectConvolutionLayer::CLDirectConvolutionLayer()
@@ -65,14 +65,14 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
     _impl->biases  = biases;
     _impl->dst     = output;
 
-    _impl->op = std::make_unique<opencl::ClDirectConvolution>();
+    _impl->op = std::make_unique<opencl::ClDirectConv2d>();
     _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
 Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    return opencl::ClDirectConvolution::validate(input, weights, biases, output, conv_info, act_info);
+    return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
 }
 
 void CLDirectConvolutionLayer::run()
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index fbaec1d2d9..7ba911c342 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -26,16 +26,16 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPooling.h"
+#include "src/runtime/gpu/cl/operators/ClPool2d.h"
 
 namespace arm_compute
 {
 struct CLPoolingLayer::Impl
 {
-    const ICLTensor                    *src{ nullptr };
-    ICLTensor                          *dst{ nullptr };
-    ICLTensor                          *indices{ nullptr };
-    std::unique_ptr<opencl::ClPooling>  op{ nullptr };
+    const ICLTensor                   *src{ nullptr };
+    ICLTensor                         *dst{ nullptr };
+    ICLTensor                         *indices{ nullptr };
+    std::unique_ptr<opencl::ClPool2d>  op{ nullptr };
 };
 
 CLPoolingLayer::CLPoolingLayer()
@@ -55,13 +55,13 @@ void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTenso
     _impl->dst     = output;
     _impl->indices = indices;
 
-    _impl->op = std::make_unique<opencl::ClPooling>();
+    _impl->op = std::make_unique<opencl::ClPool2d>();
     _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
 }
 
 Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    return opencl::ClPooling::validate(input, output, pool_info, indices);
+    return opencl::ClPool2d::validate(input, output, pool_info, indices);
 }
 
 void CLPoolingLayer::run()
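Both CL functions above follow the same pimpl pattern: the public layer keeps only tensor pointers plus the renamed operator, and repacks them at run time. A condensed sketch of the pattern (member alignment elided; slot names as used throughout this patch):

    struct Impl
    {
        const ICLTensor                   *src{ nullptr };
        ICLTensor                         *dst{ nullptr };
        std::unique_ptr<opencl::ClPool2d>  op{ nullptr };
    };
    // run() then forwards the stored pointers on every call, roughly:
    //   ITensorPack pack{ { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
    //   _impl->op->run(pack);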
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index da9610ef42..a561b88058 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -27,7 +27,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
 
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
@@ -47,15 +47,15 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
     const ITensor *biases
     {
         nullptr
-    };                                                  // SRC_2
-    Tensor                                              permuted_input{};   // INT_0
-    Tensor                                              permuted_weights{}; // INT_1
-    Tensor                                              permuted_output{};  // INT_2
-    Tensor                                              workspace{};        // INT_3
-    Tensor                                              packed_weights{};   // INT_4
-    std::shared_ptr<cpu::CpuDepthwiseConvolution>       op{ nullptr };
-    bool                                                is_prepared{ false };
-    bool                                                permute{ false };
+    };                                             // SRC_2
+    Tensor                                         permuted_input{};   // INT_0
+    Tensor                                         permuted_weights{}; // INT_1
+    Tensor                                         permuted_output{};  // INT_2
+    Tensor                                         workspace{};        // INT_3
+    Tensor                                         packed_weights{};   // INT_4
+    std::shared_ptr<cpu::CpuDepthwiseConv2d>       op{ nullptr };
+    bool                                           is_prepared{ false };
+    bool                                           permute{ false };
 };
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
@@ -80,7 +80,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
     _impl->dst     = output;
     _impl->permute = is_nhwc;
 
-    _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
     _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
                          _impl->dst->info(), info);
@@ -97,7 +97,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
     }
 
     info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
-    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConvolutionAssemblyDispatch>();
+    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
 
     if(is_nhwc)
     {
@@ -154,7 +154,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
                                                                                       const Size2D &dilation)
 {
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
@@ -197,17 +197,17 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
 
 struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
 {
-    Tensor                                        permuted_input{};
-    Tensor                                        permuted_weights{};
-    Tensor                                        permuted_output{};
-    bool                                          is_prepared{ false };
-    bool                                          is_nchw{ false };
-    bool                                          is_activationlayer_enabled{ false };
-    const ITensor                                *weights{ nullptr };
-    const ITensor                                *biases{ nullptr };
-    const ITensor                                *src{ nullptr };
-    ITensor                                      *dst{ nullptr };
-    std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+    Tensor                                   permuted_input{};
+    Tensor                                   permuted_weights{};
+    Tensor                                   permuted_output{};
+    bool                                     is_prepared{ false };
+    bool                                     is_nchw{ false };
+    bool                                     is_activationlayer_enabled{ false };
+    const ITensor                           *weights{ nullptr };
+    const ITensor                           *biases{ nullptr };
+    const ITensor                           *src{ nullptr };
+    ITensor                                 *dst{ nullptr };
+    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
 };
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
@@ -223,7 +223,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
                                                                     output->info(), conv_info, depth_multiplier, act_info, dilation));
 
     const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
     _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
 
     _impl->src     = input;
@@ -253,7 +253,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
         output_to_use = &_impl->permuted_output;
     }
 
-    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>();
+    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
     depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
 
     if(_impl->is_nchw)
@@ -273,7 +273,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate
                                                                              unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
@@ -298,10 +298,10 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
 #ifndef DOXYGEN_SKIP_THIS
 struct NEDepthwiseConvolutionLayer::Impl
 {
-    DepthwiseConvolutionFunction                  depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
-    NEDepthwiseConvolutionLayerOptimizedInternal  func_optimized{ nullptr };
-    NEDepthwiseConvolutionLayerGeneric            func_generic{};
-    std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+    DepthwiseConvolutionFunction                 depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
+    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+    NEDepthwiseConvolutionLayerGeneric           func_generic{};
+    std::shared_ptr<cpu::CpuDepthwiseConv2d>     op{ nullptr };
 };
 #endif // DOXYGEN_SKIP_THIS
@@ -309,7 +309,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
                                             const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    _impl->op              = std::make_shared<cpu::CpuDepthwiseConvolution>();
+    _impl->op              = std::make_shared<cpu::CpuDepthwiseConv2d>();
     _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
     switch(_impl->depth_conv_func)
@@ -329,7 +329,7 @@ Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
                                              unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 73834381c6..58530e4a8f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,17 +27,17 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDirectConvolution.h"
+#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
 
 namespace arm_compute
 {
 struct NEDirectConvolutionLayer::Impl
 {
-    ITensor                                    *src{ nullptr };
-    const ITensor                              *weights{ nullptr };
-    const ITensor                              *bias{ nullptr };
-    ITensor                                    *dst{ nullptr };
-    std::unique_ptr<cpu::CpuDirectConvolution>  op{ nullptr };
+    ITensor                               *src{ nullptr };
+    const ITensor                         *weights{ nullptr };
+    const ITensor                         *bias{ nullptr };
+    ITensor                               *dst{ nullptr };
+    std::unique_ptr<cpu::CpuDirectConv2d>  op{ nullptr };
 };
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -52,14 +52,14 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
     _impl->weights = weights;
     _impl->bias    = bias;
     _impl->dst     = output;
-    _impl->op      = std::make_unique<cpu::CpuDirectConvolution>(_memory_manager);
+    _impl->op      = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
     _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
 }
 
 Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    return cpu::CpuDirectConvolution::validate(input, weights, bias, output, conv_info, act_info);
+    return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
 }
 
 void NEDirectConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 1570cdeedc..bbf3e7cc4e 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -26,17 +26,17 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuPooling.h"
+#include "src/runtime/cpu/operators/CpuPool2d.h"
 
 namespace arm_compute
 {
 struct NEPoolingLayer::Impl
 {
-    ITensor                          *src{ nullptr };
-    ITensor                          *dst{ nullptr };
-    ITensor                          *indices{ nullptr };
-    Tensor                            workspace{ nullptr };
-    std::unique_ptr<cpu::CpuPooling>  op{ nullptr };
+    ITensor                         *src{ nullptr };
+    ITensor                         *dst{ nullptr };
+    ITensor                         *indices{ nullptr };
+    Tensor                           workspace{ nullptr };
+    std::unique_ptr<cpu::CpuPool2d>  op{ nullptr };
 };
 
 NEPoolingLayer::~NEPoolingLayer() = default;
@@ -51,7 +51,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
     _impl->src     = input;
     _impl->dst     = output;
     _impl->indices = indices;
-    _impl->op      = std::make_unique<cpu::CpuPooling>();
+    _impl->op      = std::make_unique<cpu::CpuPool2d>();
     _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
 
     // Allocate workspace based on kernel's memory requirements
@@ -66,7 +66,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
 
 Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    return cpu::CpuPooling::validate(input, output, pool_info, indices);
+    return cpu::CpuPool2d::validate(input, output, pool_info, indices);
 }
 
 void NEPoolingLayer::run()
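The comment in NEPoolingLayer::configure above ("Allocate workspace based on kernel's memory requirements") refers to the experimental workspace() interface these operators expose. A minimal sketch of how a wrapper can service it (assuming a single requirement; member names are illustrative):

    // Query the scratch needs reported by the operator and back them with a Tensor.
    const experimental::MemoryRequirements reqs = _impl->op->workspace();
    if(!reqs.empty())
    {
        _impl->workspace.allocator()->init(TensorInfo(TensorShape{ reqs[0].size + reqs[0].alignment }, 1, DataType::S8));
        _impl->workspace.allocator()->allocate();
    }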
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..160a9fd70b
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    if(!is_data_type_quantized_per_channel(weights->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
+    const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+                                info.pad_stride_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+                                info.pad_stride_info.pad_bottom());
+
+    if(biases != nullptr)
+    {
+        const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
+
+    // Validate Activation Layer
+    if(info.act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+    }
+    return Status{};
+}
+} // namespace
+
+CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::CpuDepthwiseConv2dOptimizedInternal()
+    : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+{
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo           *src,
+                                                                        const ITensorInfo     *weights,
+                                                                        const ITensorInfo     *biases,
+                                                                        ITensorInfo           *dst,
+                                                                        const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
+                                                                             dst, info));
+
+    _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+    _has_bias     = biases != nullptr;
+    _is_nchw      = src->data_layout() == DataLayout::NCHW;
+    _permute      = _is_nchw;
+    _is_prepared  = false;
+
+    // Configure pipeline
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(info.act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(info.act_info);
+    _is_activationlayer_enabled         = info.act_info.enabled() && !(is_relu || is_relu6);
+
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = info.act_info;
+    }
+
+    _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
+    if(_is_nchw)
+    {
+        _permute_input   = std::make_unique<CpuPermute>();
+        _permute_weights = std::make_unique<CpuPermute>();
+        _permute_output  = std::make_unique<CpuPermute>();
+
+        auto input_perm   = std::make_unique<TensorInfo>();
+        auto weights_perm = std::make_unique<TensorInfo>();
+        auto output_perm  = std::make_unique<TensorInfo>();
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+        input_perm->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+        weights_perm->set_data_layout(DataLayout::NHWC);
+
+        output_perm->set_data_layout(DataLayout::NHWC);
+        output_perm->set_quantization_info(dst->quantization_info());
+
+        // Configure optimized depthwise
+        _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        output_perm->set_data_layout(DataLayout::NHWC);
+        _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+    }
+    else
+    {
+        _dwc_optimized_func->configure(src, weights, biases, dst, info);
+    }
+
+    // Configure activation
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(dst, nullptr, info.act_info);
+    }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo     *src,
+                                                                         const ITensorInfo     *weights,
+                                                                         const ITensorInfo     *biases,
+                                                                         const ITensorInfo     *dst,
+                                                                         const ConvolutionInfo &info)
+{
+    return validate_arguments_optimized(src, weights, biases, dst, info);
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    prepare(tensors);
+
+    auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    auto dst            = tensors.get_tensor(TensorType::ACL_DST_0);
+    auto workspace      = tensors.get_tensor(TensorType::ACL_INT_3);
+    auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+    // Permute input
+    if(_permute)
+    {
+        ITensorPack pack;
+        auto        src      = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+        auto        src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+        pack.add_tensor(TensorType::ACL_SRC, src);
+        pack.add_tensor(TensorType::ACL_DST, src_perm);
+        _permute_input->run(pack);
+    }
+
+    // Run assembly function
+    if(_is_nchw)
+    {
+        auto src_perm     = tensors.get_tensor(TensorType::ACL_INT_0);
+        auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+        auto dst_perm     = tensors.get_tensor(TensorType::ACL_INT_2);
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_0, src_perm);
+        pack.add_tensor(TensorType::ACL_SRC_1, weights_perm);
+        pack.add_tensor(TensorType::ACL_SRC_2, bias);
+        pack.add_tensor(TensorType::ACL_INT_0, workspace);
+        pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+        pack.add_tensor(TensorType::ACL_DST, dst_perm);
+        _dwc_optimized_func->run(pack);
+    }
+    else
+    {
+        auto src     = tensors.get_tensor(TensorType::ACL_SRC_0);
+        auto weights = tensors.get_tensor(TensorType::ACL_SRC_1);
+        auto dst     = tensors.get_tensor(TensorType::ACL_DST);
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_0, src);
+        pack.add_tensor(TensorType::ACL_SRC_1, weights);
+        pack.add_tensor(TensorType::ACL_SRC_2, bias);
+        pack.add_tensor(TensorType::ACL_INT_0, workspace);
+        pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _dwc_optimized_func->run(pack);
+    }
+
+    // Permute output
+    if(_is_nchw)
+    {
+        ITensorPack pack;
+        auto        dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+        pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _permute_output->run(pack);
+    }
+
+    // Run activation
+    if(_is_activationlayer_enabled)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, dst);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _activationlayer_function->run(pack);
+    }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors)
+{
+    if(!_is_prepared)
+    {
+        auto weights        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+        auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+        // Permute weights
+        if(_permute)
+        {
+            auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
+
+            ITensorPack pack;
+            pack.add_tensor(TensorType::ACL_SRC, weights);
+            pack.add_tensor(TensorType::ACL_DST, permuted_weights);
+            _permute_weights->run(pack);
+
+            weights->mark_as_unused();
+
+            ITensorPack pack_opt;
+            pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights);
+            pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+            pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+            // Prepare optimized function
+            _dwc_optimized_func->prepare(pack_opt);
+        }
+        else
+        {
+            ITensorPack pack_opt;
+            pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
+            pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+            pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+            // Prepare optimized function
+            _dwc_optimized_func->prepare(pack_opt);
+        }
+
+        _is_prepared = true;
+    }
+}
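prepare() is the one-off stage: weights are permuted and packed a single time, after which the source weight tensor is released with mark_as_unused(). The contract this implies for callers, as a sketch:

    op.prepare(pack); // permutes/packs weights into the ACL_INT_* tensors, marks 'weights' unused
    op.run(pack);     // subsequent runs read only the packed copy
    // The original weights buffer may be reclaimed by a memory manager after prepare().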
+
+CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::CpuDepthwiseConv2dGeneric()
+    : _depthwise_conv_kernel(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false),
+      _is_activationlayer_enabled(false)
+{
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
+                                                            dst, info));
+
+    _is_nchw     = src->data_layout() == DataLayout::NCHW;
+    _is_prepared = !_is_nchw;
+
+    ITensorInfo       *input_to_use   = src;
+    const ITensorInfo *weights_to_use = weights;
+    ITensorInfo       *output_to_use  = dst;
+
+    auto input_perm   = std::make_unique<TensorInfo>();
+    auto weights_perm = std::make_unique<TensorInfo>();
+    auto output_perm  = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+
+    if(_is_nchw)
+    {
+        _permute_input   = std::make_unique<CpuPermute>();
+        _permute_weights = std::make_unique<CpuPermute>();
+
+        _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+        input_perm->set_data_layout(DataLayout::NHWC);
+        input_to_use = input_perm.get();
+
+        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+        weights_perm->set_data_layout(DataLayout::NHWC);
+        weights_to_use = weights_perm.get();
+
+        output_to_use = output_perm.get();
+    }
+
+    _depthwise_conv_kernel = std::make_unique<kernels::CpuDepthwiseConv2dNativeKernel>();
+    _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
+
+    if(_is_nchw)
+    {
+        _permute_output = std::make_unique<CpuPermute>();
+        _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+        output_perm->set_data_layout(DataLayout::NHWC);
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = info.act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(dst, nullptr, info.act_info);
+    }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                                               const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    if(src->data_layout() == DataLayout::NCHW)
+    {
+        TensorShape permuted_input_shape   = src->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+        const TensorInfo permuted_input   = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_output  = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+    }
+
+    // Validate Activation Layer
+    if(info.act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+    }
+
+    return Status{};
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
+{
+    auto src     = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    auto biases  = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    auto dst     = tensors.get_tensor(TensorType::ACL_DST_0);
+
+    if(_is_nchw)
+    {
+        prepare(tensors);
+        auto src_perm     = tensors.get_tensor(TensorType::ACL_INT_0);
+        auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+        auto dst_perm     = tensors.get_tensor(TensorType::ACL_INT_2);
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, src);
+        pack.add_tensor(TensorType::ACL_DST, src_perm);
+        _permute_input->run(pack);
+
+        ITensorPack pack_depth;
+        pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm);
+        pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
+        pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+        pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
+        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+    }
+    else
+    {
+        ITensorPack pack_depth;
+        pack_depth.add_tensor(TensorType::ACL_SRC_0, src);
+        pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
+        pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+        pack_depth.add_tensor(TensorType::ACL_DST, dst);
+        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+    }
+
+    if(_is_nchw)
+    {
+        ITensorPack pack;
+        auto        dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+        pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _permute_output->run(pack);
+    }
+
+    if(_is_activationlayer_enabled)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, dst);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _activationlayer_function->run(pack);
+    }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
+{
+    if(!_is_prepared)
+    {
+        auto weights      = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+
+        ARM_COMPUTE_ERROR_ON(!weights->is_used());
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, weights);
+        pack.add_tensor(TensorType::ACL_DST, weights_perm);
+
+        _permute_weights->run(pack);
+        weights->mark_as_unused();
+        _is_prepared = true;
+    }
+}
+
+CpuDepthwiseConv2d::CpuDepthwiseConv2d()
+    : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic()
+{
+}
+
+void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.configure(src, weights, biases, dst, info);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.configure(src, weights, biases, dst, info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+    }
+}
+
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
+    switch(depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+    }
+}
+
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                                                                   const ConvolutionInfo &info)
+{
+    if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+    {
+        return DepthwiseConvolutionFunction::OPTIMIZED;
+    }
+    else
+    {
+        return DepthwiseConvolutionFunction::GENERIC;
+    }
+}
+
+void CpuDepthwiseConv2d::run(ITensorPack &tensors)
+{
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.run(tensors);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.run(tensors);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+    }
+}
+
+void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
+{
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.prepare(tensors);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.prepare(tensors);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
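Like the other operators in this series, CpuDepthwiseConv2d holds no tensor pointers; callers supply them per call through an ITensorPack using the ACL_SRC_*/ACL_DST_* slots seen above. A minimal usage sketch (tensor creation elided; src, weights, biases and dst are assumed ITensor pointers):

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);     // input
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights); // weights
    pack.add_const_tensor(TensorType::ACL_SRC_2, biases);  // may be nullptr
    pack.add_tensor(TensorType::ACL_DST_0, dst);           // output

    op.prepare(pack); // one-off weight packing/permutation
    op.run(pack);     // per-inference execution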
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
new file mode 100644
index 0000000000..049397fe60
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConv2d : public ICpuOperator
+{
+public:
+    /** Default constructor */
+    CpuDepthwiseConv2d();
+    /** Initialize the function's source, destination, weights and convolution information.
+     *
+     * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+     * @param[in]      weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+     *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]      info    Depthwise convolution meta-data.
+     */
+    void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuDepthwiseConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+    /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
+     *
+     * @param[in] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+     *                    Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                    Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] dst     Destination tensor. Data type supported: same as @p src.
+     * @param[in] info    Depthwise convolution meta-data.
+     *
+     * @return a Depthwise Convolution Function
+     */
+    static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                                                          const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+
+private:
+    /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+     *
+     * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
+     *
+     * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
+     * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present
+     * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+     * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required
+     * -# @ref NEActivationLayer if fused activation is required
+     *
+     */
+    class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
+    {
+    public:
+        /** Default constructor */
+        CpuDepthwiseConv2dOptimizedInternal();
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+        /** Default move constructor */
+        CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+        /** Default move assignment operator */
+        CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+        /** Default destructor */
+        ~CpuDepthwiseConv2dOptimizedInternal() = default;
+        /** Initialize the function's source, destination, kernels and border_size.
+         *
+         * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
+         * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+         * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+         * @param[in]      info    Depthwise convolution meta-data.
+         */
+        void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+        /** Static function to check if given info will lead to a valid configuration
+         *
+         * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
+         *
+         * @return a status
+         */
+        static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+        // Inherited methods overridden:
+        void run(ITensorPack &tensors) override;
+        void prepare(ITensorPack &tensors) override;
+
+    private:
+        std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
+        std::unique_ptr<CpuPermute>                         _permute_input{ nullptr };
+        std::unique_ptr<CpuPermute>                         _permute_weights{ nullptr };
+        std::unique_ptr<CpuPermute>                         _permute_output{ nullptr };
+        std::unique_ptr<CpuActivation>                      _activationlayer_function{ nullptr };
+        bool                                                _has_bias{ false };
+        bool                                                _is_quantized{ false };
+        bool                                                _is_nchw{ true };
+        bool                                                _permute{ false };
+        bool                                                _is_activationlayer_enabled{ false };
+        bool                                                _is_prepared{ false };
+    };
+
+    /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
+     *
+     * -# @ref CpuDepthwiseConv2dNativeKernel
+     *
+     */
+    class CpuDepthwiseConv2dGeneric : public ICpuOperator
+    {
+    public:
+        /** Default constructor */
+        CpuDepthwiseConv2dGeneric();
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete;
+        /** Default move constructor */
+        CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default;
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete;
+        /** Default move assignment operator */
+        CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default;
+        /** Default destructor */
+        ~CpuDepthwiseConv2dGeneric() = default;
+        /** Initialize the function's source, destination, weights and convolution information.
+         *
+         * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+         * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+         *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+         * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+         * @param[in]      info    Depthwise convolution meta-data.
+         */
+        void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+        /** Static function to check if given info will lead to a valid configuration
+         *
+         * Similar to CpuDepthwiseConv2dGeneric::configure()
+         *
+         * @return a status
+         */
+        static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+        // Inherited methods overridden:
+        void run(ITensorPack &tensors) override;
+        void prepare(ITensorPack &tensors) override;
+
+    private:
+        std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_input{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_weights{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_output{ nullptr };
+        std::unique_ptr<CpuActivation>                           _activationlayer_function{ nullptr };
+        bool                                                     _is_nchw{ true };
+        bool                                                     _is_prepared{ false };
+        bool                                                     _is_activationlayer_enabled{ false };
+    };
+
+    DepthwiseConvolutionFunction        _depth_conv_func;
+    CpuDepthwiseConv2dOptimizedInternal _func_optimized;
+    CpuDepthwiseConv2dGeneric           _func_generic;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2D_H */
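The header above also documents the selection contract: get_depthwiseconvolution_function() probes the optimized path's validate() and falls back to GENERIC otherwise. Caller-side, the same check is one expression (sketch; DepthwiseConvolutionFunction values are as declared in this series):

    const auto func = cpu::CpuDepthwiseConv2d::get_depthwiseconvolution_function(src, weights, biases, dst, info);
    const bool assembly_path_selected = (func == DepthwiseConvolutionFunction::OPTIMIZED);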
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/InfoHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" +#include "src/core/helpers/AutoConfiguration.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qasymm8::QAsymm8RescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int 
in_cols, int n_channels, + neon_convolution_kernels::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return 
std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +std::unique_ptr create_convolver(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *output, + const ConvolutionInfo &info) +{ + const DataType data_type = src->data_type(); + const TensorShape shape = src->tensor_shape(); + + const int n_batches = shape[3]; + const int in_rows = shape.z(); + const int in_cols = shape.y(); + const int n_channels = shape.x(); + const int dilation_factor = info.dilation.x(); + const int padding_top = info.pad_stride_info.pad_top(); + const int padding_left = info.pad_stride_info.pad_left(); + const int padding_bottom = info.pad_stride_info.pad_bottom(); + const int padding_right = info.pad_stride_info.pad_right(); + + const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); + const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + + const unsigned int stride_x = info.pad_stride_info.stride().first; + const unsigned int kernel_size = weights->tensor_shape().y(); + + // Map activation function + neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; + if(arm_compute::utils::info_helpers::is_relu(info.act_info)) + { + activation = neon_convolution_kernels::ActivationFunction::ReLU; + } + else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) + { + activation = neon_convolution_kernels::ActivationFunction::ReLU6; + } + + // Create quantized convolver + if(is_uniform_quantized) + { + const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int32_t qmultiplier = 0; + int32_t qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); + + return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } + else if(is_perchannel_quantized) + { + const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->quantization_info(); + const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + 
ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + std::vector fmultipliers; + std::vector qmultipliers; + std::vector qshifts; + + for(auto const s : wqinfo.scales) + { + const float fmultipler = iqinfo.scale * s / oqinfo.scale; + int32_t qmultiplier = 0; + int32_t qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + fmultipliers.push_back(fmultipler); + qmultipliers.push_back(qmultiplier); + qshifts.push_back(qshift); + } + + qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); + + return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } + else + { + // Create float convolver + switch(data_type) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + { + return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + } + default: + return nullptr; + } + } +} +} // namespace + +struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl +{ + std::unique_ptr dwc_assembly_kernel{ nullptr }; + NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{}; + bool is_prepared{ false }; + experimental::MemoryRequirements mem_req{}; +}; + +#ifndef DOXYGEN_SKIP_THIS +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() + : _pImpl(std::make_unique()) +{ +} +#endif /* DOXYGEN_SKIP_THIS */ + +CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default; + +void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_UNUSED(bias); + ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src, + weights, + bias != nullptr ? 
bias : nullptr, + dst, + info)); + + // Output auto initialization if not yet initialized + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info())); + + _pImpl->is_prepared = false; + + // Create convolver + _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info); + ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr); + + // Create assembly kernel wrapper + _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get()); + + constexpr size_t alignment = 128; + + // Create workspace + const unsigned int num_threads = NEScheduler::get().num_threads(); + const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads); + ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0!"); + _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); + + // Create packing tensor + const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size(); + ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0!"); + + _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment }); +} + +experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const +{ + return _pImpl->mem_req; +} + +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + + // Validate convolver + ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info)); + + // Validate activation + const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); + const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); + ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); + + // Check bias + if(bias != nullptr) + { + unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); + } + + // Check output + if(dst->total_size() != 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + // The uniform quantization case will only have 1 scale value in the weights quantization info + const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->quantization_info(); + const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform(); + for(auto const s : weights_qinfo.scale()) + { + const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale; + ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); + } + + return
Status{}; +} + +bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo *src, + const ITensorInfo *weights, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); + + // Reshape input shape if in NHWC format + const DataLayout data_layout = src->data_layout(); + TensorShape in_shape{ src->tensor_shape() }; + if(data_layout == DataLayout::NHWC) + { + in_shape.set(Window::DimX, src->tensor_shape().y()); + in_shape.set(Window::DimY, src->tensor_shape().z()); + in_shape.set(Window::DimZ, src->tensor_shape().x()); + } + + // Check data type + const DataType input_type = src->data_type(); + const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; + const DataType weights_type = weights->data_type(); + const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED + || weights_type == DataType::QSYMM8_PER_CHANNEL; + + // Check weights size + std::set<unsigned int> supported_kernel_sizes = { 3, 5 }; + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int kernel_w = weights->dimension(width_idx); + const unsigned int kernel_h = weights->dimension(height_idx); + bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); + + // Check for supported strides + const auto &strides = info.pad_stride_info.stride(); + bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); + + // Check for supported padding + const auto pad_top = info.pad_stride_info.pad_top(); + const auto pad_right = info.pad_stride_info.pad_right(); + const auto pad_bottom = info.pad_stride_info.pad_bottom(); + const auto pad_left = info.pad_stride_info.pad_left(); + PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); + bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); + bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); + bool supported_padding = is_same_padding || is_valid_padding; + // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported + bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); + + if(weights_type == DataType::QSYMM8_PER_CHANNEL) + { + is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); + } + + return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; +} + +void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) +{ + // Prepare assembly kernel + prepare(tensors); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Setup inputs/outputs + ARM_COMPUTE_ERROR_ON(workspace == nullptr || workspace->buffer() == nullptr); + _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer())); + + ARM_COMPUTE_ERROR_ON(src->buffer() == nullptr);
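For reference, a minimal sketch of driving the support query above (the shapes and the aggregate-initialised ConvolutionInfo are illustrative assumptions, not part of this patch):

    // Hypothetical NHWC F32 case: batch 1, 56x56 spatial, 64 channels,
    // 3x3 kernel, stride 1, SAME padding, no dilation, depth multiplier 1.
    TensorInfo src_info(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo weights_info(TensorShape(64U, 3U, 3U), 1, DataType::F32);
    weights_info.set_data_layout(DataLayout::NHWC);
    const ConvolutionInfo conv_info{ PadStrideInfo(1, 1, 1, 1), 1, ActivationLayerInfo(), Size2D(1U, 1U) };
    const bool use_assembly = cpu::CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(&src_info, &weights_info, conv_info);
    // Expected to hold true here; an unsupported kernel size, stride, padding
    // scheme, dilation or depth multiplier flips it to false.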
+ const int input_element_size = src->info()->element_size(); + const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; + const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size; + const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; + const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); + + ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); + const int output_element_size = dst->info()->element_size(); + const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; + const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; + const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; + void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); + + // Schedule assembly kernel + NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); +} + +void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) +{ + if(!_pImpl->is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); + + ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); + + // Pack weights and bias + const int weights_element_size = weights->info()->element_size(); + const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; + const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; + _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), + weights->buffer() + weights->info()->offset_first_element_in_bytes(), + weights_row_stride, + weights_col_stride, + (bias != nullptr) ? bias->buffer() : nullptr); + _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); + + weights->mark_as_unused(); + if(bias != nullptr) + { + bias->mark_as_unused(); + } + _pImpl->is_prepared = true; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h new file mode 100644 index 0000000000..195942b7fd --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H +#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H + +#include "src/core/common/Macros.h" +#include "src/runtime/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Depthwise convolution assembly kernel glue */ +class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator +{ +public: + /** Default constructor */ + CpuDepthwiseConv2dAssemblyDispatch(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); + /** Default destructor */ + ~CpuDepthwiseConv2dAssemblyDispatch(); + + /** Initialize the function's source, destination, kernels and border_size. + * + * @note Supports only NHWC format + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src. + * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + /** Check if the optimized kernel can be used for the given kernel sizes and strides + * + * @warning Even if this returns true, the inputs and outputs might need to be permuted, as the only supported layout is NHWC + * + * @param[in] src Input tensor info. + * @param[in] weights Weights tensor info. + * @param[in] info Depthwise convolution meta-data. + * + * @return True if the assembly kernel could be used, else false. Note that transformations of input/output could be needed. + */ + static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + struct LocalImpl; + std::unique_ptr<LocalImpl> _pImpl; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp deleted file mode 100644 index 6d097280e0..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > input->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > input->dimension(idx_h) + info.pad_stride_info.pad_top() + - info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) - { - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, info)); - - //Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); - } - return Status{}; -} -} // namespace - 
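The two dilation bounds validated above compare the effective (dilated) kernel extent against the padded input; a quick worked instance of that arithmetic:

    // Effective kernel extent along one axis: k + (k - 1) * (dilation - 1).
    // For a 3x3 kernel with dilation 2 that is 3 + 2 * 1 = 5, so the check
    // requires input_w + pad_left + pad_right >= 5 (and likewise for height).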
-CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::CpuDepthwiseConvolutionOptimizedInternal() - : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false), - _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) -{ -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configure(ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, (biases == nullptr) ? nullptr : biases, - output, info)); - - _is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - _has_bias = biases != nullptr; - _is_nchw = input->data_layout() == DataLayout::NCHW; - _permute = _is_nchw; - _is_prepared = false; - - // Configure pipeline - ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6); - - if(!_is_activationlayer_enabled) - { - act_info_to_use = info.act_info; - } - - _dwc_optimized_func = std::make_unique(); - if(_is_nchw) - { - _permute_input = std::make_unique(); - _permute_weights = std::make_unique(); - _permute_output = std::make_unique(); - - auto input_perm = std::make_unique(); - auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(); - - // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - - // Configure the function to transform the weights tensor from IHW -> HWI - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - - output_perm->set_data_layout(DataLayout::NHWC); - output_perm->set_quantization_info(output->quantization_info()); - - // Configure optimized depthwise - _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info); - - // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - output_perm->set_data_layout(DataLayout::NHWC); - _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U)); - } - else - { - _dwc_optimized_func->configure(input, weights, biases, output, info); - } - - // Configure activation - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(output, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const ConvolutionInfo &info) -{ - return validate_arguments_optimized(input, weights, biases, output, info); -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - prepare(tensors); - - auto bias = 
tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute input - if(_permute) - { - ITensorPack pack; - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - } - - // Run assembly function - if(_is_nchw) - { - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src_perm); - pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst_perm); - _dwc_optimized_func->run(pack); - } - else - { - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src); - pack.add_tensor(TensorType::ACL_SRC_1, weights); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst); - _dwc_optimized_func->run(pack); - } - - // Permute output - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - // Run activation - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute weights - if(_permute) - { - auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, permuted_weights); - _permute_weights->run(pack); - - weights->mark_as_unused(); - - ITensorPack pack_opt; - pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - else - { - ITensorPack pack_opt; - pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - - _is_prepared = true; - } -} - -CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::CpuDepthwiseConvolutionGeneric() - : _depthwise_conv_kernel(nullptr), 
_permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false), - _is_activationlayer_enabled(false) -{ -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolution::validate(input, weights, (biases == nullptr) ? nullptr : biases, - output, info)); - - _is_nchw = input->data_layout() == DataLayout::NCHW; - _is_prepared = !_is_nchw; - - ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - ITensorInfo *output_to_use = output; - - auto input_perm = std::make_unique(); - auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - - if(_is_nchw) - { - _permute_input = std::make_unique(); - _permute_weights = std::make_unique(); - - _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - input_to_use = input_perm.get(); - - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - weights_to_use = weights_perm.get(); - - output_to_use = output_perm.get(); - } - - _depthwise_conv_kernel = std::make_unique(); - _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - - if(_is_nchw) - { - _permute_output = std::make_unique(); - _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U)); - output_perm->set_data_layout(DataLayout::NHWC); - } - - //Configure Activation Layer - _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(output, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - if(input->data_layout() == DataLayout::NCHW) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - - const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); - - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); - 
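The permutation vectors used throughout these NCHW paths follow dst[i] = src[perm[i]]; a small sketch of the round trip, assuming the arm_compute::permute() shape helper:

    TensorShape shape(7U, 5U, 3U);                 // NCHW logical order: [W, H, C]
    permute(shape, PermutationVector(2U, 0U, 1U)); // -> [3, 7, 5], i.e. [C, W, H] (NHWC)
    permute(shape, PermutationVector(1U, 2U, 0U)); // -> [7, 5, 3], back to [W, H, C]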
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); - - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(input, weights, biases, output, info)); - } - - // Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); - } - - return Status{}; -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::run(ITensorPack &tensors) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - - if(_is_nchw) - { - prepare(tensors); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - - ITensorPack pack_depth; - pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); - pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - else - { - ITensorPack pack_depth; - pack_depth.add_tensor(TensorType::ACL_SRC_0, src); - pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(!weights->is_used()); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, weights_perm); - - _permute_weights->run(pack); - weights->mark_as_unused(); - _is_prepared = true; - } -} - -CpuDepthwiseConvolution::CpuDepthwiseConvolution() - : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic() -{ -} - -void CpuDepthwiseConvolution::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) -{ - _depth_conv_func = get_depthwiseconvolution_function(input, 
weights, (biases != nullptr) ? biases : nullptr, output, info); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.configure(input, weights, biases, output, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.configure(input, weights, biases, output, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status CpuDepthwiseConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, info); - switch(depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - return CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - return CpuDepthwiseConvolutionGeneric::validate(input, weights, biases, output, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -DepthwiseConvolutionFunction CpuDepthwiseConvolution::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info) -{ - if(bool(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } -} - -void CpuDepthwiseConvolution::run(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.run(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} - -void CpuDepthwiseConvolution::prepare(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h deleted file mode 100644 index e39cb7db4d..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZATION_H -#define ARM_COMPUTE_CPU_DEQUANTIZATION_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h" -#include "src/runtime/cpu/operators/CpuPermute.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to execute a depthwise convolution. - */ -class CpuDepthwiseConvolution : public ICpuOperator -{ -public: - /** Default constructor */ - CpuDepthwiseConvolution(); - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors info with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConvolution - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. 
- * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return a Depthwise Convolution Function - */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - -private: - /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels: - * - * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported - * - * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present - * -# @ref CpuDepthwiseConvolution3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present - * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required - * -# @ref NEActivationLayer if fused activation is required - * - */ - class CpuDepthwiseConvolutionOptimizedInternal : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConvolutionOptimizedInternal(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionOptimizedInternal(const CpuDepthwiseConvolutionOptimizedInternal &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionOptimizedInternal(CpuDepthwiseConvolutionOptimizedInternal &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionOptimizedInternal &operator=(const CpuDepthwiseConvolutionOptimizedInternal &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionOptimizedInternal &operator=(CpuDepthwiseConvolutionOptimizedInternal &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionOptimizedInternal() = default; - /** Initialize the function's source, destination, kernels and border_size. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution3x3 - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. 
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _dwc_optimized_func{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - }; - - /** Basic function to execute a generic depthwise convolution. This function calls the following kernel: - * - * -# @ref CpuDepthwiseConvolutionNativeKernel - * - */ - class CpuDepthwiseConvolutionGeneric : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConvolutionGeneric(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionGeneric(const CpuDepthwiseConvolutionGeneric &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionGeneric(CpuDepthwiseConvolutionGeneric &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionGeneric &operator=(const CpuDepthwiseConvolutionGeneric &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionGeneric &operator=(CpuDepthwiseConvolutionGeneric &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionGeneric() = default; - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionGeneric - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. 
A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _depthwise_conv_kernel{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; - }; - - DepthwiseConvolutionFunction _depth_conv_func; - CpuDepthwiseConvolutionOptimizedInternal _func_optimized; - CpuDepthwiseConvolutionGeneric _func_generic; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZATION_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp deleted file mode 100644 index 039714abb1..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp +++ /dev/null @@ -1,563 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" -#include "src/core/helpers/AutoConfiguration.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, 
in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr create_convolver(const ITensorInfo *input, - const ITensorInfo *weights, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - const DataType data_type = input->data_type(); - const TensorShape shape = input->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = info.dilation.x(); - const int padding_top = info.pad_stride_info.pad_top(); - const int padding_left = info.pad_stride_info.pad_left(); - const int padding_bottom = info.pad_stride_info.pad_bottom(); - const int padding_right = info.pad_stride_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == 
DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = info.pad_stride_info.stride().first; - const unsigned int kernel_size = weights->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector fmultipliers; - std::vector qmultipliers; - std::vector qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } - - qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); - - return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, - wqinfo, iqinfo, oqinfo, rescale_params, 
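
    [Aside, not part of the patch] Both quantized paths above reduce requantization to a
    fixed-point multiply-and-shift via quantization::calculate_quantized_multiplier_less_than_one;
    the per-channel path simply repeats the decomposition once per weight scale, as in the loop
    above. A minimal standalone sketch of the usual gemmlowp-style decomposition follows. It is
    an illustration only, not the library's implementation; rounding and saturation details may differ.

        #include <cmath>
        #include <cstdint>

        // Split a real multiplier M in [0, 1) into a Q0.31 fixed-point multiplier and a
        // right shift, so that M ~= q_multiplier * 2^(-31 - q_shift).
        void decompose_multiplier(float multiplier, int32_t *q_multiplier, int32_t *q_shift)
        {
            int   exponent = 0;
            float mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
            auto  q        = static_cast<int64_t>(std::llround(static_cast<double>(mantissa) * (1ll << 31)));
            if(q == (1ll << 31)) // rounding pushed the mantissa up to 1.0: renormalise
            {
                q /= 2;
                ++exponent;
            }
            *q_multiplier = static_cast<int32_t>(q);
            *q_shift      = -exponent; // non-negative for multipliers below 1
        }

    At run time the kernel can then requantize an int32 accumulator with one widening multiply
    by q_multiplier followed by a rounding right shift of (31 + q_shift) bits, with no floating
    point involved.
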
padding_top, padding_left, padding_bottom, padding_right); - } - else - { - // Create float convolver - switch(data_type) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } - default: - return nullptr; - } - } -} -} // namespace - -struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl -{ - std::unique_ptr dwc_assembly_kernel{ nullptr }; - NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{}; - bool is_prepared{ false }; - experimental::MemoryRequirements mem_req{}; -}; - -#ifndef DOXYGEN_SKIP_THIS -CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch() - : _pImpl(std::make_unique()) -{ -} -#endif /* DOXYGEN_SKIP_THIS */ - -CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch() = default; - -void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(bias); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, - weights, - bias != nullptr ? bias : nullptr, - output, - info)); - - // Output auto inizialitation if not yet initialized - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info())); - - _pImpl->is_prepared = false; - - // Create convolver - _pImpl->dwc_assembly_kernel = create_convolver(input, weights, output, info); - ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr); - - // Create assembly kernel wrapper - _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get()); - - constexpr size_t alignment = 128; - - // Create workspace - const unsigned int num_threads = NEScheduler::get().num_threads(); - const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads); - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); - - // Create packing tensor - const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size(); - ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); - - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment }); -} - -experimental::MemoryRequirements CpuDepthwiseConvolutionAssemblyDispatch::workspace() const -{ - return _pImpl->mem_req; -} - -Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != 
DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, info)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - // The uniform quantization case will only have 1 scale value in the weights quantization info - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = input_qinfo.scale * s / output_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; -} - -bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input, - const ITensorInfo *weights, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = input->data_layout(); - TensorShape in_shape{ input->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, input->tensor_shape().y()); - in_shape.set(Window::DimY, input->tensor_shape().z()); - in_shape.set(Window::DimZ, input->tensor_shape().x()); - } - - // Check data type - const DataType input_type = input->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weighs size - std::set supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = info.pad_stride_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = info.pad_stride_info.pad_top(); - 
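
    [Aside, not part of the patch] The padding check in is_optimized_supported accepts exactly
    two schemes: VALID (all four pads zero) and SAME as computed by calculate_same_pad. For
    reference, the SAME arithmetic amounts to the following sketch, written under the usual
    convention of pushing the surplus pixel toward bottom/right; it is assumed equivalent to,
    not copied from, the library function.

        #include <algorithm>

        struct Pad2D
        {
            unsigned int top, bottom, left, right;
        };

        Pad2D same_pad(unsigned int in_h, unsigned int in_w, unsigned int k_h, unsigned int k_w,
                       unsigned int stride, unsigned int dilation)
        {
            auto total = [&](unsigned int in, unsigned int k) -> unsigned int
            {
                const unsigned int out   = (in + stride - 1) / stride; // SAME output size: ceil(in / stride)
                const unsigned int eff_k = (k - 1) * dilation + 1;     // dilated kernel extent
                const int needed = static_cast<int>((out - 1) * stride + eff_k) - static_cast<int>(in);
                return static_cast<unsigned int>(std::max(needed, 0));
            };
            const unsigned int pad_h = total(in_h, k_h);
            const unsigned int pad_w = total(in_w, k_w);
            // Split as evenly as possible, surplus toward bottom/right
            return { pad_h / 2, pad_h - pad_h / 2, pad_w / 2, pad_w - pad_w / 2 };
        }

    For example, a 224x224 input with a 3x3 kernel, stride 2 and dilation 1 yields pads of
    (top, bottom, left, right) = (0, 1, 0, 1), which is what the dispatcher compares against.
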
const auto pad_right = info.pad_stride_info.pad_right(); - const auto pad_bottom = info.pad_stride_info.pad_bottom(); - const auto pad_left = info.pad_stride_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; -} - -void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors) -{ - // Prepare assembly kernel - prepare(tensors); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr); - _pImpl->dwc_assembly_kernel->set_working_space(static_cast(workspace->buffer())); - - ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr); - const int input_element_size = src->info()->element_size(); - const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size; - const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); - const int output_element_size = dst->info()->element_size(); - const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); - - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); -} - -void CpuDepthwiseConvolutionAssemblyDispatch::prepare(ITensorPack &tensors) -{ - if(!_pImpl->is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); - - // Pack weights and bias - const int weights_element_size = weights->info()->element_size(); - const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; - const int 
weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), - weights->buffer() + weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (bias != nullptr) ? bias->buffer() : nullptr); - _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); - - weights->mark_as_unused(); - if(bias != nullptr) - { - bias->mark_as_unused(); - } - _pImpl->is_prepared = true; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h deleted file mode 100644 index 6aac74c3ef..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H -#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Depthwise convolution assembly kernel glue */ -class CpuDepthwiseConvolutionAssemblyDispatch : public ICpuOperator -{ -public: - CpuDepthwiseConvolutionAssemblyDispatch(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionAssemblyDispatch(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionAssemblyDispatch(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionAssemblyDispatch &operator=(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionAssemblyDispatch &operator=(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionAssemblyDispatch(); - /** Initialize the function's source, destination, kernels and border_size. - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. 
Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionAssemblyDispatch - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return An error status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const ConvolutionInfo &info); - /** Check if the optimized kernel can be used for the given kernel sizes and strides - * - * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC - * - * @param[in] input Input tensor info. - * @param[in] weights Weights tensor info. - * @param[in] info Depthwise convolution meta-data. - * - * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed. - */ - static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - struct LocalImpl; - std::unique_ptr _pImpl; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp new file mode 100644 index 0000000000..8812b777a3 --- /dev/null +++ b/src/runtime/cpu/operators/CpuDirectConv2d.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuDirectConv2d::~CpuDirectConv2d() = default;
+
+CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
+      _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+{
+}
+
+void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+    _output_stage_kernel  = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>();
+    _conv_kernel          = std::make_unique<kernels::CpuDirectConv2dKernel>();
+    _input_border_handler = std::make_unique<NEFillBorderKernel>();
+
+    // Free accumulator
+    if(_accumulator.buffer() != nullptr)
+    {
+        _accumulator.allocator()->free();
+    }
+
+    _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
+    // Check if bias should be added in the convolution result
+    _has_bias = (bias != nullptr);
+
+    _conv_kernel->configure(src, weights, dst, conv_info);
+    if(_has_bias)
+    {
+        _output_stage_kernel->configure(dst, bias);
+    }
+    _is_padding_required = !_conv_kernel->border_size().empty();
+
+    if(_is_padding_required)
+    {
+        // Add zero padding XY
+        _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(dst, dst, act_info);
+    }
+}
+
+Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                                 const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+
+    // output might not be initialized since it can be an intermediate tensor of another layer
+    DataType   data_type = src->data_type();
+    TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+    // Validate Convolution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of input feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+    }
+
+    // Validate bias kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
+
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
+    }
+
+    return Status{};
+}
+
+void CpuDirectConv2d::run(ITensorPack &tensors)
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    auto src  = tensors.get_tensor(TensorType::ACL_SRC_0);
+    auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    auto dst  = tensors.get_tensor(TensorType::ACL_DST);
+
+    if(_is_padding_required)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_DST, src);
+        NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+    }
+    NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
+    if(_has_bias)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_0, dst);
+        pack.add_tensor(TensorType::ACL_SRC_1, bias);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
+    }
+
+    if(_is_activationlayer_enabled)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, dst);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _activationlayer_function->run(pack);
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.h b/src/runtime/cpu/operators/CpuDirectConv2d.h
new file mode 100644
index 0000000000..9e584b9c49
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDirectConv2d.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
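
    [Aside, not part of the patch] A usage sketch of the operator-style API that these ported
    classes expose: the operator is configured on ITensorInfo objects and executed on an
    ITensorPack, with tensors bound by slot. The slots below mirror CpuDirectConv2d::run()
    above; src, weights, bias and dst are assumed to be already-allocated Tensor objects, and
    the exact wiring used by the library lives in the NEDirectConvolutionLayer wrapper.

        using namespace arm_compute;

        cpu::CpuDirectConv2d conv;
        conv.configure(src.info(), weights.info(), bias.info(), dst.info(), conv_info);

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, &src);     // input, read from ACL_SRC_0 in run()
        pack.add_tensor(TensorType::ACL_SRC_1, &weights); // weights, consumed by the conv kernel
        pack.add_tensor(TensorType::ACL_SRC_2, &bias);    // optional bias, ACL_SRC_2 in run()
        pack.add_tensor(TensorType::ACL_DST, &dst);       // output
        conv.run(pack);

+/** Function to run the direct convolution.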
+ * + * This function calls the following kernels: + * + * -# @ref NEFillBorderKernel for the input + * -# @ref kernels::CpuDirectConv2dOutputStageKernel + * -# @ref kernels::CpuDirectConv2dKernel + */ +class CpuDirectConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuDirectConv2d(std::shared_ptr memory_manager = nullptr); + /** Destructor */ + ~CpuDirectConv2d(); + /** Set the input, weights, biases and output tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in, out] src Input tensor info. Data types supported: F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p src. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. + * @param[out] dst Output tensor info. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr _output_stage_kernel; + std::unique_ptr _conv_kernel; + std::unique_ptr _input_border_handler; + std::unique_ptr _activationlayer_function; + Tensor _accumulator; + bool _has_bias{ false }; + bool _is_activationlayer_enabled{ false }; + unsigned int _dim_split{ 0 }; + bool _is_padding_required{ false }; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.cpp b/src/runtime/cpu/operators/CpuDirectConvolution.cpp deleted file mode 100644 index 33f79603e8..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConvolution.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDirectConvolution.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuDirectConvolution::~CpuDirectConvolution() = default; - -CpuDirectConvolution::CpuDirectConvolution(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() -{ -} - -void CpuDirectConvolution::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - _output_stage_kernel = std::make_unique(); - _conv_kernel = std::make_unique(); - _input_border_handler = std::make_unique(); - - // Free accumulator - if(_accumulator.buffer() != nullptr) - { - _accumulator.allocator()->free(); - } - - _dim_split = src->data_layout() == DataLayout::NCHW ? 
Window::DimZ : Window::DimY; - - // Check if bias should be added in the convolution result - _has_bias = (bias != nullptr); - - _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) - { - _output_stage_kernel->configure(dst, bias); - } - _is_padding_required = !_conv_kernel->border_size().empty(); - - if(_is_padding_required) - { - // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(dst, dst, act_info); - } -} - -Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - - // output might not be initialized since it can be an intermediate tensor of another layer - DataType data_type = src->data_type(); - TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); - - // Validate Convolution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionKernel::validate(src, weights, &accumulator, conv_info)); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), - "Biases size and number of input feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); - } - - // Validate bias kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionOutputStageKernel::validate(&accumulator, bias, dst)); - - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); - } - - return Status{}; -} - -void CpuDirectConvolution::run(ITensorPack &tensors) -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_is_padding_required) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); - } - NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, dst); - pack.add_tensor(TensorType::ACL_SRC_1, bias); - pack.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.h b/src/runtime/cpu/operators/CpuDirectConvolution.h deleted file mode 100644 index 0635e087fd..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConvolution.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H -#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h" -#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to run the direct convolution. - * - * This function calls the following kernels: - * - * -# @ref NEFillBorderKernel for the input - * -# @ref kernels::CpuDirectConvolutionOutputStageKernel - * -# @ref kernels::CpuDirectConvolutionKernel - */ -class CpuDirectConvolution : public ICpuOperator -{ -public: - /** Constructor */ - CpuDirectConvolution(std::shared_ptr memory_manager = nullptr); - /** Destructor */ - ~CpuDirectConvolution(); - /** Set the input, weights, biases and output tensors. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in, out] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
- */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[in] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - MemoryGroup _memory_group; - std::unique_ptr _output_stage_kernel; - std::unique_ptr _conv_kernel; - std::unique_ptr _input_border_handler; - std::unique_ptr _activationlayer_function; - Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H */ diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp new file mode 100644 index 0000000000..b225199c40 --- /dev/null +++ b/src/runtime/cpu/operators/CpuPool2d.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuPool2d.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/cpu/kernels/CpuPool2dKernel.h" +#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuPool2d::CpuPool2d() + : _pooling_layer_kernel(), + _border_handler(), + _asm_glue(), + _is_global_pooling_layer(false), + _data_layout(DataLayout::NCHW), + _mem_req() +{ +} + +CpuPool2d::~CpuPool2d() = default; + +void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + // Check if we can run assembly kernels. Currently, indices are not supported by those kernels + const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + // Get data layout + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + + // Check if we have Global Pooling Layer + const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); + + if(run_optimised) + { + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + auto pooling_wrapper = std::make_unique(); + ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); + pooling_wrapper->configure(src, dst, pool_info, ci); + + // Get kernel's memory requirements + constexpr size_t alignment = 4096; + const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); + _mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); + + _asm_glue = std::move(pooling_wrapper); + } + else + { + // Configure pooling kernel + auto k = std::make_unique(); + k->configure(src, dst, pool_info, indices); + _pooling_layer_kernel = std::move(k); + + switch(_data_layout) + { + case DataLayout::NCHW: + { + // Configure border depending on operation required (quantize border in case of asymmetric data_type) + BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; + PixelValue zero_value((indices) ? 
std::numeric_limits::min() : 0.f); + if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) + { + zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + } + auto b = std::make_unique(); + b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); + _border_handler = std::move(b); + break; + } + case DataLayout::NHWC: + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + if(run_optimised) + { + return Status{}; + } + + return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices); +} + +void CpuPool2d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); + + if(_asm_glue) + { + const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; + NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); + } + else + { + switch(_data_layout) + { + case DataLayout::NCHW: + // Fill border + NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); + + // Run pooling layer + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); + break; + case DataLayout::NHWC: + // Run pooling layer + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +experimental::MemoryRequirements CpuPool2d::workspace() const +{ + return _mem_req; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h new file mode 100644 index 0000000000..ae3d115dfc --- /dev/null +++ b/src/runtime/cpu/operators/CpuPool2d.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
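
    [Aside, not part of the patch] CpuPool2d::workspace() above reports the auxiliary memory
    the assembly path needs (slot ACL_INT_0, 4096-byte aligned); the caller owns the
    allocation and hands it back through the pack. A hedged sketch of that contract follows.
    The member names slot/size/alignment are read off the push_back in configure(); src and
    dst are assumed to be allocated Tensors, and the input/output slot choices are
    assumptions (the NEPoolingLayer wrapper shows the canonical packing).

        using namespace arm_compute;

        cpu::CpuPool2d pool;
        pool.configure(src.info(), dst.info(), pool_info);

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);

        Tensor workspace; // backing buffer for ACL_INT_0, kept alive across run()
        if(!pool.workspace().empty())
        {
            const auto &req = pool.workspace()[0]; // pushed as { ACL_INT_0, size, 4096 } in configure()
            workspace.allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8), req.alignment);
            workspace.allocator()->allocate();
            pack.add_tensor(req.slot, &workspace);
        }
        pool.run(pack);
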
+ */ +#ifndef ARM_COMPUTE_CPU_POOL2D_H +#define ARM_COMPUTE_CPU_POOL2D_H + +#include "arm_compute/core/experimental/Types.h" +#include "src/core/common/Macros.h" +#include "src/runtime/cpu/ICpuOperator.h" + +#include + +namespace arm_compute +{ +// Forward Declarations +struct PoolingLayerInfo; + +namespace cpu +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref NEFillBorderKernel (executed if padding size is different from zero) + * -# @ref kernels::CpuPool2dKernel + * -# @ref kernels::CpuPool2dAssemblyWrapperKernel + */ +class CpuPool2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuPool2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d); + /** Default destructor */ + ~CpuPool2d(); + /** Set the src and dst tensors. + * + * @note F16 is supported for pool sizes 2 and 3 only + * + * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr _pooling_layer_kernel; + std::unique_ptr _border_handler; + std::unique_ptr _asm_glue; + + bool _is_global_pooling_layer; + DataLayout _data_layout; + experimental::MemoryRequirements _mem_req; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_H */ diff --git a/src/runtime/cpu/operators/CpuPooling.cpp b/src/runtime/cpu/operators/CpuPooling.cpp deleted file mode 100644 index 3a6ac24a74..0000000000 --- a/src/runtime/cpu/operators/CpuPooling.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuPooling.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h" -#include "src/core/cpu/kernels/CpuPoolingKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuPooling::CpuPooling() - : _pooling_layer_kernel(), - _border_handler(), - _asm_glue(), - _is_global_pooling_layer(false), - _data_layout(DataLayout::NCHW), - _mem_req() -{ -} - -CpuPooling::~CpuPooling() = default; - -void CpuPooling::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - // Get data layout - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - - // Check if we have Global Pooling Layer - const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - - if(run_optimised) - { - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); - - auto pooling_wrapper = std::make_unique(); - ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); - pooling_wrapper->configure(src, dst, pool_info, ci); - - // Get kernel's memory requirements - constexpr size_t alignment = 4096; - const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); - - _asm_glue = std::move(pooling_wrapper); - } - else - { - // Configure pooling kernel - auto k = std::make_unique(); - k->configure(src, dst, pool_info, indices); - _pooling_layer_kernel = std::move(k); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value((indices) ? 
std::numeric_limits::min() : 0.f); - if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) - { - zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - } - auto b = std::make_unique(); - b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); - _border_handler = std::move(b); - break; - } - case DataLayout::NHWC: - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -Status CpuPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - if(run_optimised) - { - return Status{}; - } - - return kernels::CpuPoolingKernel::validate(src, dst, pool_info, indices); -} - -void CpuPooling::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - - if(_asm_glue) - { - const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; - NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); - } - else - { - switch(_data_layout) - { - case DataLayout::NCHW: - // Fill border - NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); - - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); - break; - case DataLayout::NHWC: - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -experimental::MemoryRequirements CpuPooling::workspace() const -{ - return _mem_req; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h deleted file mode 100644 index bc30adf762..0000000000 --- a/src/runtime/cpu/operators/CpuPooling.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
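
    [Aside, not part of the patch] One detail worth calling out in the NCHW path above: for
    quantized pooling the CONSTANT border is built as PixelValue(0, data_type, qinfo) rather
    than a raw zero byte. Quantizing the real value 0 maps it to the zero-point, so padded
    samples contribute true zeros to the accumulation. A simplified sketch of that mapping
    (the library's exact rounding policy may differ):

        #include <algorithm>
        #include <cmath>
        #include <cstdint>

        uint8_t quantize_qasymm8(float value, float scale, int32_t offset)
        {
            const int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
            return static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // clamp to the QASYMM8 range
        }

    Here quantize_qasymm8(0.f, scale, offset) returns offset, the zero-point; filling the
    border with raw 0x00 would instead inject the real value -offset * scale.
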
diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h
deleted file mode 100644
index bc30adf762..0000000000
--- a/src/runtime/cpu/operators/CpuPooling.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOLING_H
-#define ARM_COMPUTE_CPU_POOLING_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward Declarations
-struct PoolingLayerInfo;
-
-namespace cpu
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
- *
- * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::CpuPoolingKernel
- * -# @ref kernels::CpuPoolingAssemblyWrapperKernel
- */
-class CpuPooling : public ICpuOperator
-{
-public:
-    /** Constructor */
-    CpuPooling();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CpuPooling(const CpuPooling &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CpuPooling &operator=(const CpuPooling &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    CpuPooling(CpuPooling &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    CpuPooling &operator=(CpuPooling &&) = delete;
-    /** Default destructor */
-    ~CpuPooling();
-    /** Set the src and dst tensors.
-     *
-     * @note F16 is supported for pool sizes 2 and 3 only
-     *
-     * @param[in, out] src       Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out]     dst       Destination tensor info. Data types supported: same as @p src.
-     * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out]     indices   (optional) The indices of the maximal values. Data type supported: U32.
-     */
-    void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuPooling
-     *
-     * @note F16 is supported for pool sizes 2 and 3 only
-     *
-     * @param[in] src       Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] dst       Destination tensor info. Data types supported: same as @p src.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) Tensor info of the indices of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    experimental::MemoryRequirements workspace() const override;
-
-private:
-    std::unique_ptr<INEKernel> _pooling_layer_kernel;
-    std::unique_ptr<INEKernel> _border_handler;
-    std::unique_ptr<INEKernel> _asm_glue;
-
-    bool                             _is_global_pooling_layer;
-    DataLayout                       _data_layout;
-    experimental::MemoryRequirements _mem_req;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOLING_H */
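The interface deleted above survives the rename verbatim in src/runtime/cpu/operators/CpuPool2d.h, so existing call sites only swap the type name. A minimal usage sketch under that assumption; `src`, `dst` and `pool_info` are hypothetical, pre-allocated objects inside namespace arm_compute:

#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuPool2d.h"

void run_pool2d(Tensor &src, Tensor &dst, const PoolingLayerInfo &pool_info)
{
    cpu::CpuPool2d op;
    // validate() is static, so configurations can be rejected before any state is built
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuPool2d::validate(src.info(), dst.info(), pool_info));
    op.configure(src.info(), dst.info(), pool_info);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    op.run(pack); // non-assembly path: no extra workspace slots needed
}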
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
new file mode 100644
index 0000000000..527b3a65f9
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+ITensorPack select_activation_src_dst(ITensorPack &tensors)
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
+    pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
+    return pack;
+}
+} // namespace
+
+void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                               const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+    // Configure direct convolution kernel
+    auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, weights, biases, dst, conv_info);
+    _direct_conv_kernel = std::move(k);
+
+    // Configure border handler
+    PixelValue zero_value(0.f);
+    if(is_data_type_quantized_asymmetric(src->data_type()))
+    {
+        zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+    }
+    auto b = std::make_unique<CLFillBorderKernel>();
+    b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
+    _src_border_handler = std::move(b);
+
+    if(act_info.enabled())
+    {
+        auto a = std::make_unique<kernels::ClActivationKernel>();
+        a->configure(compile_context, dst, dst, act_info);
+        _activation_kernel = std::move(a);
+    }
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
+}
+
+Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target()));
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
+    }
+    return Status{};
+}
+
+void ClDirectConv2d::run(ITensorPack &tensors)
+{
+    // Run border handler
+    CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
+    // Run direct convolution
+    CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
+    // Run activation kernel
+    if(_activation_kernel)
+    {
+        auto act_pack = select_activation_src_dst(tensors);
+        CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
+    }
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.h b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
new file mode 100644
index 0000000000..e069733fab
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a direct convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::ClDirectConv2dKernel
+ */
+class ClDirectConv2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClDirectConv2d() = default;
+    /** Set the src and dst tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of srcs.
+     *                             Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
+     * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] dst             Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+     *                             Data types supported: Same as @p src.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClDirectConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
+    std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
+    std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */
\ No newline at end of file
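The new operator is stateless with respect to tensors: configure() only sees ITensorInfo, and the actual CLTensor objects arrive at run() inside an ITensorPack, which is how the CLDirectConvolutionLayer wrapper drives it. A sketch of direct use; the ACL_SRC_0/1/2 slot convention is an assumption taken from the wrapper, and all tensor and info objects are hypothetical, inside namespace arm_compute:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"

void run_direct_conv(CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst,
                     const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
    opencl::ClDirectConv2d conv;
    conv.configure(CLKernelLibrary::get().get_compile_context(),
                   src.info(), weights.info(), biases.info(), dst.info(), conv_info, act_info);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src);     // input
    pack.add_tensor(TensorType::ACL_SRC_1, &weights); // weights
    pack.add_tensor(TensorType::ACL_SRC_2, &biases);  // biases
    pack.add_tensor(TensorType::ACL_DST, &dst);       // output
    conv.run(pack);
}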
diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp b/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp
deleted file mode 100644
index 3382a6c3c5..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace
-{
-ITensorPack select_activation_src_dst(ITensorPack &tensors)
-{
-    ITensorPack pack;
-    pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
-    pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
-    return pack;
-}
-} // namespace
-
-void ClDirectConvolution::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                    const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
-    // Configure direct convolution kernel
-    auto k = std::make_unique<kernels::ClDirectConvolutionKernel>();
-    k->set_target(CLScheduler::get().target());
-    k->configure(compile_context, src, weights, biases, dst, conv_info);
-    _direct_conv_kernel = std::move(k);
-
-    // Configure border handler
-    PixelValue zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(src->data_type()))
-    {
-        zero_value = PixelValue(0, src->data_type(), src->quantization_info());
-    }
-    auto b = std::make_unique<CLFillBorderKernel>();
-    b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
-    _src_border_handler = std::move(b);
-
-    if(act_info.enabled())
-    {
-        auto a = std::make_unique<kernels::ClActivationKernel>();
-        a->configure(compile_context, dst, dst, act_info);
-        _activation_kernel = std::move(a);
-    }
-
-    // Tune kernels
-    CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
-}
-
-Status ClDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                     const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConvolutionKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target()));
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
-    }
-    return Status{};
-}
-
-void ClDirectConvolution::run(ITensorPack &tensors)
-{
-    // Run border handler
-    CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
-    // Run direct convolution
-    CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
-    // Run activation kernel
-    if(_activation_kernel)
-    {
-        auto act_pack = select_activation_src_dst(tensors);
-        CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
-    }
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.h b/src/runtime/gpu/cl/operators/ClDirectConvolution.h
deleted file mode 100644
index e7ad927b0b..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConvolution.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H
-#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClDirectConvolution
- */
-class ClDirectConvolution : public IClOperator
-{
-public:
-    /** Constructor */
-    ClDirectConvolution() = default;
-    /** Set the src and dst tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of srcs.
-     *                             Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
-     * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                             Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[out] dst             Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
-     *                             Data types supported: Same as @p src.
-     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolution
-     *
-     * @param[in] src       Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of srcs.
-     *                      Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
-     * @param[in] biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                      Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[in] dst       Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
-     *                      Data types supported: Same as @p src.
-     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] act_info  (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-
-private:
-    std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
-    std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
-    std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H */
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.cpp b/src/runtime/gpu/cl/operators/ClPool2d.cpp
new file mode 100644
index 0000000000..40c2b0a8ba
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClPool2d.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClPool2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    // Configure pooling kernel
+    auto k = std::make_unique<kernels::ClPool2dKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, dst, info, indices);
+    _pooling = std::move(k);
+
+    const DataType data_type = src->data_type();
+
+    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+    BorderMode border_mode{};
+    PixelValue pixel_value(0.f);
+    if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
+    {
+        pixel_value = PixelValue(0, data_type, src->quantization_info());
+    }
+
+    // Data layout
+    const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+            border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+            break;
+        case DataLayout::NHWC:
+            border_mode = BorderMode::CONSTANT;
+            if(PoolingType::MAX == info.pool_type)
+            {
+                if(is_data_type_quantized(data_type))
+                {
+                    std::tie(pixel_value, std::ignore) = get_min_max(data_type);
+                }
+                else
+                {
+                    pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+                }
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
+    auto b = std::make_unique<CLFillBorderKernel>();
+    b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
+    _border_handler = std::move(b);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_pooling);
+}
+
+Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
+{
+    return kernels::ClPool2dKernel::validate(src, dst, info, indices);
+}
+
+void ClPool2d::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+    CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
+    CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
+}
+} // namespace opencl
+} // namespace arm_compute
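A gloss on the border configuration above: with a CONSTANT border, the fill value must be the identity element of the pooling reduction so that padded positions can never win a MAX. That is why the float path uses the lowest representable value and the quantized path takes the type minimum from get_min_max (the quantized zero point floor, e.g. 0 for QASYMM8). A small sketch of the rule, mirroring the switch above; the helper name is hypothetical:

#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include <limits>
#include <tuple>

// Sketch: the pad value used for NHWC MAX pooling, as selected above.
// max(x, pad) == x must hold for every representable x.
arm_compute::PixelValue max_pool_pad_value(arm_compute::DataType dt)
{
    if(arm_compute::is_data_type_quantized(dt))
    {
        arm_compute::PixelValue min_val{}, max_val{};
        std::tie(min_val, max_val) = arm_compute::get_min_max(dt); // e.g. 0 for QASYMM8
        return min_val;
    }
    return arm_compute::PixelValue(std::numeric_limits<float>::lowest());
}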
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.h b/src/runtime/gpu/cl/operators/ClPool2d.h
new file mode 100644
index 0000000000..8ac386a64b
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClPool2d.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_H
+#define ARM_COMPUTE_CL_POOL2D_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::ClPool2dKernel
+ */
+class ClPool2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClPool2d() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst             Destination tensor info. Data type supported: same as @p src
+     * @param[in]  info            Pooling layer parameters.
+     * @param[out] indices         (optional) The indices info of the maximal values. Data type supported: U32.
+     */
+    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPool2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<IClKernel> _pooling{ nullptr };
+    std::unique_ptr<IClKernel> _border_handler{ nullptr };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL2D_H */
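Note also that ClPool2d::run() enqueues both of its kernels with flush set to false, so nothing forces the OpenCL queue; the caller synchronises when the result is needed on the host. A short sketch of the resulting call pattern, with hypothetical tensors and assuming the common ACL_SRC/ACL_DST slot convention:

// Sketch: validate, configure, run, then sync explicitly since run() does not flush.
opencl::ClPool2d pool;
ARM_COMPUTE_ERROR_THROW_ON(opencl::ClPool2d::validate(src.info(), dst.info(), pool_info));
pool.configure(CLKernelLibrary::get().get_compile_context(), src.info(), dst.info(), pool_info);

ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, &src);
pack.add_tensor(TensorType::ACL_DST, &dst);
pool.run(pack);
CLScheduler::get().sync(); // block until border fill and pooling have finished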
diff --git a/src/runtime/gpu/cl/operators/ClPooling.cpp b/src/runtime/gpu/cl/operators/ClPooling.cpp
deleted file mode 100644
index 8610eb9842..0000000000
--- a/src/runtime/gpu/cl/operators/ClPooling.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPooling.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClPoolingKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClPooling::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
-    // Configure pooling kernel
-    auto k = std::make_unique<kernels::ClPoolingKernel>();
-    k->set_target(CLScheduler::get().target());
-    k->configure(compile_context, src, dst, info, indices);
-    _pooling = std::move(k);
-
-    const DataType data_type = src->data_type();
-
-    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-    BorderMode border_mode{};
-    PixelValue pixel_value(0.f);
-    if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
-    {
-        pixel_value = PixelValue(0, data_type, src->quantization_info());
-    }
-
-    // Data layout
-    const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-            border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-            break;
-        case DataLayout::NHWC:
-            border_mode = BorderMode::CONSTANT;
-            if(PoolingType::MAX == info.pool_type)
-            {
-                if(is_data_type_quantized(data_type))
-                {
-                    std::tie(pixel_value, std::ignore) = get_min_max(data_type);
-                }
-                else
-                {
-                    pixel_value = PixelValue(std::numeric_limits<float>::lowest());
-                }
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Data layout not supported");
-    }
-    auto b = std::make_unique<CLFillBorderKernel>();
-    b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
-    _border_handler = std::move(b);
-
-    // Tune kernels
-    CLScheduler::get().tune_kernel_static(*_pooling);
-}
-
-Status ClPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
-{
-    return kernels::ClPoolingKernel::validate(src, dst, info, indices);
-}
-
-void ClPooling::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
-    CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
-    CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClPooling.h b/src/runtime/gpu/cl/operators/ClPooling.h
deleted file mode 100644
index 99de6d0dcf..0000000000
--- a/src/runtime/gpu/cl/operators/ClPooling.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_POOLING_H
-#define ARM_COMPUTE_CL_POOLING_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClPooling
- */
-class ClPooling : public IClOperator
-{
-public:
-    /** Constructor */
-    ClPooling() = default;
-    /** Configure operator for a given list of arguments
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] dst             Destination tensor info. Data type supported: same as @p src
-     * @param[in]  info            Pooling layer parameters.
-     * @param[out] indices         (optional) The indices info of the maximal values. Data type supported: U32.
-     */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClPooling
-     *
-     * @param[in]  src     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] dst     Destination tensor info. Data type supported: same as @p src
-     * @param[in]  info    Pooling layer parameters.
-     * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
-
-    // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-
-private:
-    std::unique_ptr<IClKernel> _pooling{ nullptr };
-    std::unique_ptr<IClKernel> _border_handler{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_POOLING_H */
--
cgit v1.2.1