From 60c3b0e6821a80d78ffca5be30e05d062d071cd2 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 8 Apr 2021 12:02:58 +0100 Subject: Port DepthwiseConvolution to new API Resolves: COMPMID-4185 Change-Id: Ib5f22356356a022d567bb18d44ea272b62d10ebf Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5424 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- Android.bp | 5 +- arm_compute/core/Types.h | 13 + arm_compute/core/utils/misc/ShapeCalculator.h | 15 +- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 64 +- .../NEDepthwiseConvolutionAssemblyDispatch.h | 123 --- docs/00_introduction.dox | 6 +- docs/06_functions_list.dox | 1 - scripts/clang_tidy_rules.py | 9 +- .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 9 +- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 5 +- .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 8 +- src/core/NEON/NEKernels.h | 1 - .../NEDepthwiseConvolutionLayerNativeKernel.cpp | 911 -------------------- .../NEDepthwiseConvolutionLayerNativeKernel.h | 131 --- src/core/cpu/kernels/CpuActivationKernel.cpp | 2 +- .../CpuDepthwiseConvolutionNativeKernel.cpp | 918 +++++++++++++++++++++ .../kernels/CpuDepthwiseConvolutionNativeKernel.h | 117 +++ .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 14 +- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 409 ++++----- .../NEDepthwiseConvolutionAssemblyDispatch.cpp | 569 ------------- .../cpu/operators/CpuDepthwiseConvolution.cpp | 521 ++++++++++++ .../cpu/operators/CpuDepthwiseConvolution.h | 230 ++++++ .../CpuDepthwiseConvolutionAssemblyDispatch.cpp | 564 +++++++++++++ .../CpuDepthwiseConvolutionAssemblyDispatch.h | 97 +++ tests/NEON/Helper.h | 15 +- .../NEON/DepthwiseConvolutionLayerNative.cpp | 17 +- .../fixtures/DepthwiseConvolutionLayerFixture.h | 33 +- 27 files changed, 2731 insertions(+), 2076 deletions(-) delete mode 100644 arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h delete mode 100644 src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp delete mode 100644 src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h create mode 100644 src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp create mode 100644 src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h delete mode 100644 src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.h create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h diff --git a/Android.bp b/Android.bp index 3736ef754f..d968f029a2 100644 --- a/Android.bp +++ b/Android.bp @@ -176,7 +176,6 @@ cc_library_static { "src/core/NEON/kernels/NECropKernel.cpp", "src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp", "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp", - "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp", "src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp", "src/core/NEON/kernels/NEFFTRadixStageKernel.cpp", "src/core/NEON/kernels/NEFFTScaleKernel.cpp", @@ -299,6 +298,7 @@ cc_library_static { "src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp", "src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp", "src/core/cpu/kernels/CpuCopyKernel.cpp", + "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp", 
"src/core/cpu/kernels/CpuDequantizationKernel.cpp", "src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp", "src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp", @@ -612,7 +612,6 @@ cc_library_static { "src/runtime/NEON/functions/NETranspose.cpp", "src/runtime/NEON/functions/NEUnstack.cpp", "src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp", - "src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp", "src/runtime/OMP/OMPScheduler.cpp", "src/runtime/OffsetLifetimeManager.cpp", "src/runtime/OffsetMemoryPool.cpp", @@ -630,6 +629,8 @@ cc_library_static { "src/runtime/cpu/operators/CpuAdd.cpp", "src/runtime/cpu/operators/CpuConcatenate.cpp", "src/runtime/cpu/operators/CpuCopy.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp", "src/runtime/cpu/operators/CpuDequantization.cpp", "src/runtime/cpu/operators/CpuDirectConvolution.cpp", "src/runtime/cpu/operators/CpuElementwise.cpp", diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 53333ff608..b1f340d18e 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1861,6 +1861,19 @@ private: bool _broadcast_bias; }; +struct ConvolutionInfo +{ + ConvolutionInfo() = default; + ConvolutionInfo(const PadStrideInfo &pad_stride_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + : pad_stride_info(pad_stride_info), depth_multiplier(depth_multiplier), act_info(act_info), dilation(dilation) + { + } + PadStrideInfo pad_stride_info{}; /**< Convolution info (Pads, strides,...) */ + unsigned int depth_multiplier{ 1 }; /**< Multiplier to apply to input's depth to retrieve the output depth. Defaults to 1 */ + ActivationLayerInfo act_info{}; /**< Fused activation to apply after convolution. */ + Size2D dilation{ Size2D(1, 1) }; /**< Dilation, in elements, across x and y. Defaults to (1, 1). */ +}; + struct DepthwiseConvolutionReshapeInfo { unsigned int c0{ 1 }; /**< Number of channels processed by the depth-wise convolution */ diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 56038dd853..ba37f9a61e 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -435,16 +435,13 @@ inline TensorShape compute_transposed_shape(const ITensorInfo &input) /** Calculate the depthwise convolution output shape of a tensor * - * @param[in] input Input tensor info - * @param[in] weights Weights tensor info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. - * @param[in] dilation Dilation, in elements, across x and y. Defaults to (1, 1). 
+ * @param[in] input   Input tensor info
+ * @param[in] weights Weights tensor info
+ * @param[in] info    Convolution info
 *
 * @return the calculated shape
 */
-inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation = Size2D(1U,
-                                                       1U))
+inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
 {
     const TensorShape input_shape{ input.tensor_shape() };
     const TensorShape weights_shape{ weights.tensor_shape() };
@@ -462,12 +459,12 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
     unsigned int output_height = 0;
     std::tie(output_width, output_height) = scaled_dimensions(input_shape[width_idx], input_shape[height_idx],
                                                               weights_shape[weights_width_idx], weights_shape[weights_height_idx],
-                                                              conv_info, dilation);
+                                                              info.pad_stride_info, info.dilation);

     TensorShape output_shape{ input_shape };
     output_shape.set(width_idx, output_width);
     output_shape.set(height_idx, output_height);
-    output_shape.set(channel_idx, input_shape[channel_idx] * depth_multiplier);
+    output_shape.set(channel_idx, input_shape[channel_idx] * info.depth_multiplier);

     return output_shape;
 }
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 9aa8f04eb8..c74b2a93ee 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -24,9 +24,10 @@
 #ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H
 #define ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H

+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
-#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
 #include <memory>

 namespace arm_compute
 {
@@ -91,25 +92,6 @@ public:
     void prepare() override;

 private:
-    /** Static function to choose the best depthwise convolution function for @ref NEDepthwiseConvolutionLayer
-     *
-     * @param[in] input            Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] weights          Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
-     *                             Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] biases           Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
-     *                             Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output           Destination tensor. Data type supported: same as @p input.
-     * @param[in] conv_info        Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 quantized are supported.
-     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     * @return a Depthwise Convolution Function
-     */
-    static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                          const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
-                                                                          ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
     /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
      *
      * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
@@ -173,23 +155,9 @@ private:
         void prepare() override;

     private:
-        MemoryGroup                            _memory_group;
-        NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func;
-        NEPermute                              _permute_input;
-        NEPermute                              _permute_weights;
-        NEPermute                              _permute_output;
-        NEActivationLayer                      _activationlayer_function;
-        Tensor                                 _accumulator;
-        Tensor                                 _permuted_input;
-        Tensor                                 _permuted_weights;
-        Tensor                                 _permuted_output;
-        const ITensor                         *_original_weights;
-        bool                                   _has_bias;
-        bool                                   _is_quantized;
-        bool                                   _is_nchw;
-        bool                                   _permute;
-        bool                                   _is_activationlayer_enabled;
-        bool                                   _is_prepared;
+        MemoryGroup _memory_group;
+        struct Impl;
+        std::unique_ptr<Impl> _impl;
     };

     /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
@@ -248,26 +216,14 @@ private:
         // Inherited methods overridden:
         void run() override;
-        void prepare() override;

     private:
-        std::unique_ptr<NEDepthwiseConvolutionLayerNativeKernel> _depthwise_conv_kernel;
-        NEPermute                                                _permute_input;
-        NEPermute                                                _permute_weights;
-        NEPermute                                                _permute_output;
-        NEActivationLayer                                        _activationlayer_function;
-        Tensor                                                   _permuted_input;
-        Tensor                                                   _permuted_weights;
-        Tensor                                                   _permuted_output;
-        bool                                                     _is_prepared;
-        bool                                                     _is_nchw;
-        bool                                                     _is_activationlayer_enabled;
-        const ITensor                                           *_original_weights;
+        struct Impl;
+        std::unique_ptr<Impl> _impl;
     };
-
-    DepthwiseConvolutionFunction                 _depth_conv_func;
-    NEDepthwiseConvolutionLayerOptimizedInternal _func_optimized;
-    NEDepthwiseConvolutionLayerGeneric           _func_generic;
+    MemoryGroup _memory_group;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H */
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
deleted file mode 100644
index 7f63717b02..0000000000
--- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
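The explicit member lists above collapse into the pImpl idiom: each function now only forward-declares struct Impl and owns it through std::unique_ptr<Impl>, keeping kernel and operator types out of the public header. A hypothetical sketch of what such an Impl could hold; the member names are illustrative (only cpu::CpuDepthwiseConvolution itself is introduced by this patch) and the real definition lives in NEDepthwiseConvolutionLayer.cpp:

    // Illustrative only -- not the actual definition shipped in this commit.
    struct NEDepthwiseConvolutionLayer::Impl
    {
        ITensor       *src{ nullptr };
        const ITensor *weights{ nullptr };
        const ITensor *biases{ nullptr };
        ITensor       *dst{ nullptr };
        std::unique_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr }; // new-API backend operator
        bool           is_prepared{ false };
    };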
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H -#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" - -namespace arm_compute -{ -/** Depthwise convolution assembly kernel glue */ -class NEDepthwiseConvolutionAssemblyDispatch : public IFunction -{ -public: - /** Default constructor - * - * @param[in,out] memory_manager Memory manager to use - */ - NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionAssemblyDispatch(const NEDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move constructor */ - NEDepthwiseConvolutionAssemblyDispatch(NEDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move assignment operator */ - NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Default destructor */ - ~NEDepthwiseConvolutionAssemblyDispatch(); - /** Initialize the function's source, destination, kernels and border_size. - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - const Size2D &dilation = Size2D(1, 1)); - /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. 
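As the configure()/validate() documentation here implies, the intended call pattern is the usual Compute Library one: validate against ITensorInfo first, and configure live tensors only on success. A sketch using the surviving public entry point, assuming src/weights/bias/dst are Tensor objects that have already been created and allocated:

    // Validate-then-configure flow (error handling reduced to a bail-out).
    const Status st = NEDepthwiseConvolutionLayer::validate(src.info(), weights.info(), bias.info(), dst.info(),
                                                            conv_info, /* depth_multiplier */ 1);
    if(bool(st)) // Status converts to true on success
    {
        NEDepthwiseConvolutionLayer dwc;
        dwc.configure(&src, &weights, &bias, &dst, conv_info, 1);
        dwc.run();
    }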
Data type supported: same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * - * @return An error status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - const Size2D &dilation = Size2D(1, 1)); - /** Check if the optimized kernel can be used for the given kernel sizes and strides - * - * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC - * - * @param[in] input Input tensor info. - * @param[in] weights Weights tensor info. - * @param[in] conv_info Convolution layer metadata. - * @param[in] depth_multiplier (Optional) Depth multiplier to be used. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * - * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed. - */ - static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1, 1)); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - struct LocalImpl; - -private: - MemoryGroup _memory_group; - const ITensor *_input; - const ITensor *_weights; - const ITensor *_bias; - ITensor *_output; - Tensor _packed_weights; - Tensor _workspace; - bool _is_prepared; - std::unique_ptr _pImpl; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */ diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index efc2963f6e..cfb27ff8b2 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -266,7 +266,7 @@ v20.11 Public major release - @ref NEGEMMTranspose1xWKernel - NEPoolingLayerKernel - NEConvolutionKernel - - @ref NEDepthwiseConvolutionLayerNativeKernel + - NEDepthwiseConvolutionLayerNativeKernel - @ref NEGEMMLowpMatrixMultiplyKernel - @ref NEGEMMMatrixMultiplyKernel - NEDirectConvolutionLayerOutputStageKernel @@ -818,7 +818,7 @@ v19.08 Public major release - @ref NESinLayer - NEBatchConcatenateLayerKernel - @ref NEDepthToSpaceLayerKernel / @ref NEDepthToSpaceLayer - - @ref NEDepthwiseConvolutionLayerNativeKernel + - NEDepthwiseConvolutionLayerNativeKernel - @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - @ref NEMeanStdDevNormalizationKernel / @ref NEMeanStdDevNormalizationLayer - @ref NESpaceToDepthLayerKernel / @ref NESpaceToDepthLayer @@ -863,7 +863,7 @@ v19.05 Public major release - @ref NEBatchToSpaceLayerKernel / @ref NEBatchToSpaceLayer - NEComplexPixelWiseMultiplicationKernel / @ref NEComplexPixelWiseMultiplication - @ref NECropKernel / @ref NECropResize - - @ref NEDepthwiseConvolutionAssemblyDispatch + - NEDepthwiseConvolutionAssemblyDispatch - @ref NEFFTDigitReverseKernel - @ref NEFFTRadixStageKernel - @ref NEFFTScaleKernel diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox index de43a0cee7..bd044203f2 100644 --- 
a/docs/06_functions_list.dox +++ b/docs/06_functions_list.dox @@ -88,7 +88,6 @@ namespace arm_compute - @ref NEConvolutionLayerReshapeWeights - @ref NECropResize - @ref NEDeconvolutionLayer - - @ref NEDepthwiseConvolutionAssemblyDispatch - @ref NEDepthwiseConvolutionLayer - @ref NEDequantizationLayer - @ref NEDetectionPostProcessLayer diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index fd6fd9412e..c38e3ddde4 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -111,14 +111,15 @@ def filter_clang_tidy_lines( lines ): ("NEWinogradLayerKernel.cpp" in line and "use '= default' to define a trivial destructor" in line) or ("NEGEMMLowpMatrixMultiplyCore.cpp" in line and "constructor does not initialize these fields" in line) or ("NEGEMMLowpAssemblyMatrixMultiplyCore" in line and "constructor does not initialize these fields" in line) or - ("NEDepthwiseConvolutionLayerNativeKernel" in line and re.search(r"parameter '[^']+' is unused", line)) or - ("NEDepthwiseConvolutionAssemblyDispatch" in line and re.search(r"parameter '[^']+' is unused", line)) or + ("CpuDepthwiseConvolutionNativeKernel" in line and re.search(r"parameter '[^']+' is unused", line)) or + ("CpuDepthwiseConvolutionAssemblyDispatch" in line and re.search(r"parameter '[^']+' is unused", line)) or + ("CpuDepthwiseConvolutionAssemblyDispatch" in line and "modernize-use-equals-default" in line) or ("CPUUtils.cpp" in line and "consider replacing 'unsigned long' with 'uint64'" in line) or ("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or ("CPUUtils.cpp" in line and "warning: uninitialized record type" in line) or ("Utils.h" in line and "warning: Use of zero-allocated memory" in line) or - ("NEDepthwiseConvolutionLayerNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line) or # This is to prevent false positive, should be reassessed with the newer clang-tidy - ("NEDepthwiseConvolutionLayerNativeKernel.cpp" in line and "cppcoreguidelines-pro-type-member-init" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy + ("CpuDepthwiseConvolutionNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line) or # This is to prevent false positive, should be reassessed with the newer clang-tidy + ("CpuDepthwiseConvolutionNativeKernel.cpp" in line and "cppcoreguidelines-pro-type-member-init" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy print_context=False continue diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index e8daa56f36..287a965f5b 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -105,7 +105,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, if(output->total_size() != 0) { - const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; + const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } @@ -116,7 +117,11 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen unsigned int depth_multiplier, GPUTarget 
gpu_target, std::string &kernel_name, const Size2D dilation)
 {
     // Output auto initialization if not yet initialized
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+    const ConvolutionInfo info
+    {
+        conv_info, depth_multiplier, ActivationLayerInfo(), dilation
+    };
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, info);
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));

     const unsigned int conv_stride_x = conv_info.stride().first;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 6400ba53e2..f7603e6397 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -65,8 +65,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     const size_t weights_width  = 3;
     const size_t weights_height = 3;

-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
-                                         *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
+    const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
+    const TensorShape     output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+                                             *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), info);

     if(is_qasymm)
     {
         DepthwiseConvolutionReshapeInfo info;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index c34018a000..fcfa7f878d 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
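Both hunks above keep the existing auto_init_if_empty() call: the destination info is written only when the caller left it empty, while a pre-initialised output is validated rather than overwritten. A simplified sketch of that behaviour, not the library's actual helper:

    // Simplified idea of auto_init_if_empty(): initialise dst from src only if dst is empty.
    inline void auto_init_if_empty_sketch(ITensorInfo &dst, const ITensorInfo &src)
    {
        if(dst.tensor_shape().total_size() == 0)
        {
            dst.set_data_type(src.data_type());
            dst.set_num_channels(src.num_channels());
            dst.set_tensor_shape(src.tensor_shape());
            dst.set_quantization_info(src.quantization_info());
        }
    }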
* * SPDX-License-Identifier: MIT * @@ -58,7 +58,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_UNUSED(idx_c); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier)); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); const bool is_quantized = is_data_type_quantized(input->data_type()); @@ -156,7 +157,8 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & auto padding_info = get_padding_info({ input, output }); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info, depth_multiplier, dilation); + const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), info); auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); _input = input; diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index 264f521be2..e982470402 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -38,7 +38,6 @@ #include "src/core/NEON/kernels/NECropKernel.h" #include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" -#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp deleted file mode 100644 index 24fd01fee1..0000000000 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp +++ /dev/null @@ -1,911 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -namespace -{ -constexpr auto data_layout = DataLayout::NHWC; -const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); -const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); -const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - -constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); -constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); -constexpr size_t vector_size = 8; - -struct DepthwiseConvolutionRunInfo -{ - const size_t num_read_elements_per_iteration; - const uint32_t x_start; - const uint32_t x_end; - const uint32_t x_step; - const uint32_t x_leftover_start; - const size_t input_stride_y; - const size_t input_stride_z; - const size_t input_max_offset; - const size_t weights_width; - const size_t weights_height; - const size_t weights_stride_y; - const size_t weights_stride_z; - const size_t conv_stride_x; - const size_t conv_stride_y; - const size_t conv_pad_left; - const size_t conv_pad_top; - const size_t input_height; - const size_t input_width; - const size_t input_depth; - - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) - : num_read_elements_per_iteration((depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), - x_start(w.x().start()), - x_end(w.x().end()), - x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), - x_leftover_start(std::max(static_cast(w.x().end()) - static_cast(x_step) + 1, int32_t(0))), - input_stride_y(input.strides_in_bytes().y()), - input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), - weights_width(weights.dimension(width_idx)), - weights_height(weights.dimension(height_idx)), - weights_stride_y(weights.strides_in_bytes().y()), - weights_stride_z(weights.strides_in_bytes().z()), - conv_stride_x(conv_info.stride().first), - conv_stride_y(conv_info.stride().second), - conv_pad_left(conv_info.pad_left()), - conv_pad_top(conv_info.pad_top()), - input_height(input.dimension(height_idx)), - input_width(input.dimension(width_idx)), - input_depth(input.dimension(channel_idx)) - { - } -}; - -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) -{ - const int32_t current_h = base_h + h * dilation.y(); - const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); - - const int32_t current_w = base_w + w * dilation.x(); - const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); - - return is_valid_h && is_valid_w; -} - -template -void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) -{ - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); - - const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(uint32_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * 
sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); - } - - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); - acc_scalar += biases_vals; - } - *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) -{ - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, static_cast(0)); - - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - 
run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; - } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) -{ - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - using AccType = int32_t; - using AccArrayType = std::array; - - const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); - const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); - - const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - 
const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto const base_weights_ptr = weights_it.ptr(); - size_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - AccArrayType acc{}; - AccArrayType in_sum{}; - AccArrayType we_sum{}; - - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - out_of_bound_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; - - if(has_biases) - { - acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); - } - - const int32_t out_mul = output_multiplier.at(x + i); - const int32_t out_shift = output_shift.at(x + i); - if(out_shift < 0) - { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; - } - out_vals[i] = static_cast(utility::clamp(acc.at(i))); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); - } - - // left-over - for(; x < run_info.x_end; ++x) - { - AccType acc = 0; - AccType in_sum = 0; - AccType we_sum = 0; - - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
- *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : - out_of_bound_value; - const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc += input_val * weights_val; - in_sum += input_val; - we_sum += weights_val; - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - T out_vals{ 0 }; - - acc -= in_sum * weights_qoffset; - acc -= we_sum * input_qoffset; - acc += k_offset; - - if(has_biases) - { - acc += *(reinterpret_cast(biases_it.ptr()) + x); - } - - const int32_t out_mul = output_multiplier.at(x); - const int32_t out_shift = output_shift.at(x); - - if(out_shift < 0) - { - acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; - } - - out_vals = static_cast(utility::clamp(acc)); - *(reinterpret_cast(output_it.ptr()) + x) = out_vals; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) -{ - using AccType = int32_t; - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); - - const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, 0); - std::vector we_sum(depth_multiplier, 0); - AccType in_sum = 0; - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w 
= 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) += input_val * weights_val; - - we_sum.at(m) += weights_val; - } - - offs += dilation.x() * run_info.input_stride_y; - in_sum += input_val; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0; m < depth_multiplier; ++m) - { - acc.at(m) -= in_sum * weights_qoffset; - acc.at(m) -= we_sum.at(m) * input_qoffset; - acc.at(m) += k_offset; - - if(has_biases) - { - acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - } - - const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); - const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); - if(out_shift < 0) - { - acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; - } - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) -{ - constexpr int half_vec = vector_size / 2; - - using AccType = int32_t; - using AccVectorType = typename wrapper::traits::neon_vector::type; - using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(input->info()->quantization_info().uniform().offset), TagType{}))); - const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); - const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); - - const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); - const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); - const auto zero = wrapper::vdup_n(static_cast(0), AccVectorTagType{}); - - const auto out_mul = output_multiplier.at(0); - const auto out_shift = output_shift.at(0); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - 
win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(input, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(output, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - std::vector acc0(depth_multiplier / vector_size); - std::vector acc1(depth_multiplier / vector_size); - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::fill(begin(acc0), end(acc0), zero); - std::fill(begin(acc1), end(acc1), zero); - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - const int32_t current_h = input_z + h * dilation.y(); - if(current_h >= 0 && current_h < static_cast(run_info.input_height)) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const int32_t current_w = input_y + w * dilation.x(); - if(current_w >= 0 && current_w < static_cast(run_info.input_width)) - { - const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); - const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); - const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); - const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); - - acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); - acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); - } - } - - offs += dilation.x() * run_info.input_stride_y; - } - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - if(has_biases) - { - const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); - - acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); - acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); - } - - if(out_shift < 0) - { - acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - } - else - { - acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); - } - - acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), 
lower), upper); - acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); - - const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), - wrapper::vmovn(acc1.at(i))); - - if(std::is_same::value) - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); - } - else - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom()); - ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1)); - - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); - - if(is_data_type_quantized_asymmetric(input->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - } - - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} -} // namespace - -NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel() - : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases() -{ -} - -void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), 
(biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation)); - - _input = input; - _weights = weights; - _biases = biases; - _output = output; - _conv_info = conv_info; - _depth_multiplier = depth_multiplier; - _dilation = dilation; - _has_biases = (biases != nullptr); - - if(is_data_type_quantized(_input->info()->data_type())) - { - const auto input_scale = input->info()->quantization_info().uniform().scale; - const auto output_scale = output->info()->quantization_info().uniform().scale; - - auto weights_scale = weights->info()->quantization_info().scale(); - if(!is_data_type_quantized_per_channel(_weights->info()->data_type())) - { - for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i) - { - weights_scale.push_back(weights_scale.front()); - } - } - - for(const auto &s : weights_scale) - { - int32_t out_mult = 0; - int32_t out_shift = 0; - const float multiplier = input_scale * s / output_scale; - arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); - - _output_multiplier.push_back(out_mult); - _output_shift.push_back(out_shift); - } - } - - switch(_weights->info()->data_type()) - { - case DataType::QASYMM8: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - break; - case DataType::QASYMM8_SIGNED: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - break; - case DataType::QSYMM8_PER_CHANNEL: - if(_input->info()->data_type() == DataType::QASYMM8) - { - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - } - else - { - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation); - auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); - - Window win = calculate_max_window(*output->info(), Steps()); - INEKernel::configure(win); -} - -Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation)); - return Status{}; -} - -void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - (this->*_func)(window, _has_biases); -} - -template > -void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_fp(_input, _weights, _biases, _output, _conv_info, 
_dilation, window, has_biases); - } - else - { - depthwise_loop_generic_fp(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases); - } -} - -template > -void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); - } - else - { - const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0); - const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(_weights->info()->data_type())); - - if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8) - { - depthwise_loop_pow2_quantized_per_tensor(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - else - { - depthwise_loop_generic_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - } -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h deleted file mode 100644 index 713cdcd9d9..0000000000 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H -#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H - -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/INEKernel.h" -#include "support/Requires.h" - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#include -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Interface for the kernel to run a depthwise convolution native on a tensor. 
*/ -class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEDepthwiseConvolutionLayerNativeKernel"; - } - /** Default constructor */ - NEDepthwiseConvolutionLayerNativeKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionLayerNativeKernel(const NEDepthwiseConvolutionLayerNativeKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionLayerNativeKernel &operator=(const NEDepthwiseConvolutionLayerNativeKernel &) = delete; - /** Default Move Constructor. */ - NEDepthwiseConvolutionLayerNativeKernel(NEDepthwiseConvolutionLayerNativeKernel &&) = default; - /** Default move assignment operator */ - NEDepthwiseConvolutionLayerNativeKernel &operator=(NEDepthwiseConvolutionLayerNativeKernel &&) = default; - /** Default destructor */ - ~NEDepthwiseConvolutionLayerNativeKernel() = default; - /** Initialize the function's source, destination and parameters. - * - * @note Supported data layouts: NHWC - * - * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * - */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - const Size2D &dilation = Size2D(1U, 1U)); - /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerNativeKernel - * - * @note Supported data layouts: NHWC - * - * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - const Size2D &dilation = Size2D(1U, 1U)); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - template - using FloatEnalber = typename std::enable_if::value, int>::type; - - template = 0> - void run_depthwise(const Window &window, bool has_biases); - - template - using Quantized8bitEnalber = typename std::enable_if < std::is_same::value || std::is_same::value, int >::type; - - template = 0> - void run_depthwise(const Window &window, bool has_biases); - - /** Common signature for all the specialised depthwise convolution native functions - * - * @param[in] window Region on which to execute the kernel. - */ - using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases); - - DepthwiseFunctionPtr _func; - const ITensor *_input; - const ITensor *_weights; - const ITensor *_biases; - ITensor *_output; - PadStrideInfo _conv_info; - unsigned int _depth_multiplier; - Size2D _dilation; - std::vector _output_multiplier; - std::vector _output_shift; - bool _has_biases; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp index 761258941d..eb38c18cff 100644 --- a/src/core/cpu/kernels/CpuActivationKernel.cpp +++ b/src/core/cpu/kernels/CpuActivationKernel.cpp @@ -205,7 +205,7 @@ std::pair validate_and_configure_window(const ITensorInfo *src, void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(src); _act_info = activation_info; diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp new file mode 100644 index 0000000000..a5d1b61c08 --- /dev/null +++ b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +constexpr auto data_layout = DataLayout::NHWC; +const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + +constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); +constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); +constexpr size_t vector_size = 8; + +struct DepthwiseConvolutionRunInfo +{ + const size_t num_read_elements_per_iteration; + const uint32_t x_start; + const uint32_t x_end; + const uint32_t x_step; + const uint32_t x_leftover_start; + const size_t input_stride_y; + const size_t input_stride_z; + const size_t input_max_offset; + const size_t weights_width; + const size_t weights_height; + const size_t weights_stride_y; + const size_t weights_stride_z; + const size_t conv_stride_x; + const size_t conv_stride_y; + const size_t conv_pad_left; + const size_t conv_pad_top; + const size_t input_height; + const size_t input_width; + const size_t input_depth; + + DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) + : num_read_elements_per_iteration((depth_multiplier == 1 ? 
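// vector_size above is a byte count (one 64-bit NEON register), so with
// depth_multiplier == 1 an iteration covers vector_size / element-size channels.
// A compile-time sanity sketch of that arithmetic (assumes the usual type sizes):
#include <cstdint>
static_assert(8 / sizeof(float) == 2, "two F32 lanes per 64-bit vector");
static_assert(8 / sizeof(std::uint8_t) == 8, "eight 8-bit lanes per 64-bit vector");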
(vector_size / element_size_from_data_type(input.data_type())) : 1)), + x_start(w.x().start()), + x_end(w.x().end()), + x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), + x_leftover_start(std::max(static_cast(w.x().end()) - static_cast(x_step) + 1, int32_t(0))), + input_stride_y(input.strides_in_bytes().y()), + input_stride_z(input.strides_in_bytes().z()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + weights_width(weights.dimension(width_idx)), + weights_height(weights.dimension(height_idx)), + weights_stride_y(weights.strides_in_bytes().y()), + weights_stride_z(weights.strides_in_bytes().z()), + conv_stride_x(conv_info.stride().first), + conv_stride_y(conv_info.stride().second), + conv_pad_left(conv_info.pad_left()), + conv_pad_top(conv_info.pad_top()), + input_height(input.dimension(height_idx)), + input_width(input.dimension(width_idx)), + input_depth(input.dimension(channel_idx)) + { + } +}; + +inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) +{ + const int32_t current_h = base_h + h * dilation.y(); + const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); + + const int32_t current_w = base_w + w * dilation.x(); + const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); + + return is_valid_h && is_valid_w; +} + +template +void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + const Size2D &dilation, const Window &window, bool has_biases) +{ + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector::type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); + + const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(input, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(output, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; + + for(; x < run_info.x_leftover_start; x += run_info.x_step) + { + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(uint32_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * 
sizeof(T); + for(uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? + wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : + zero_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); + } + + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); + } + + for(; x < run_info.x_end; ++x) + { + auto acc_scalar = T{ 0 }; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; + const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) +{ + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(input, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(output, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::vector acc(depth_multiplier, static_cast(0)); + + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - 
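// Out-of-range taps are handled implicitly: is_valid_input_region() selects the
// zero vector (the clamped offset only keeps the speculative load in-bounds), so
// no padded copy of the input is ever materialised. The same idea for one row of
// taps, as a scalar sketch (illustrative, not ACL code):
#include <vector>
inline float dilated_row_taps(const std::vector<float> &in, int in_w, int base_x, int k_w, int dil_x,
                              const std::vector<float> &w)
{
    float acc = 0.f;
    for(int i = 0; i < k_w; ++i)
    {
        const int x = base_x + i * dil_x; // dilated tap position
        if(x >= 0 && x < in_w)            // implicit zero padding
        {
            acc += in[x] * w[i];
        }
    }
    return acc;
}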
run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); + + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } + } + else + { + for(size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) +{ + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector::type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + using AccType = int32_t; + using AccArrayType = std::array; + + const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); + const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); + + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); + + const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(input, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(output, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + 
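// In the depth_multiplier > 1 loops each input channel c produces outputs
// c * M .. c * M + M - 1, each with its own filter slice, which is why the
// weights/output X dimension is stepped by M per input channel. The index
// mapping, spelled out for clarity (illustrative helper, not ACL API):
#include <cstddef>
inline std::size_t depthwise_output_channel(std::size_t input_channel, std::size_t m, std::size_t depth_multiplier)
{
    return input_channel * depth_multiplier + m; // m in [0, depth_multiplier)
}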
const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + size_t x = run_info.x_start; + + for(; x < run_info.x_leftover_start; x += run_info.x_step) + { + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? + wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : + out_of_bound_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + for(size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); + for(size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; + + if(has_biases) + { + acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); + } + + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if(out_shift < 0) + { + acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; + } + out_vals[i] = static_cast(utility::clamp(acc.at(i))); + } + + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); + } + + // left-over + for(; x < run_info.x_end; ++x) + { + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? 
+ *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : + out_of_bound_value; + const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + T out_vals{ 0 }; + + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; + + if(has_biases) + { + acc += *(reinterpret_cast(biases_it.ptr()) + x); + } + + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); + + if(out_shift < 0) + { + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + } + + out_vals = static_cast(utility::clamp(acc)); + *(reinterpret_cast(output_it.ptr()) + x) = out_vals; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) +{ + using AccType = int32_t; + + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); + + const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(input, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(output, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::vector acc(depth_multiplier, 0); + std::vector we_sum(depth_multiplier, 0); + AccType in_sum = 0; + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for(size_t w 
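// Why the accumulator corrections above work: with zero points zx (input) and
// zw (weights), sum((x - zx) * (w - zw)) = sum(x * w) - zw * sum(x) - zx * sum(w)
// + N * zx * zw over the N filter taps, so the kernel accumulates raw x * w plus
// the two running sums and applies the remaining terms once per output. A scalar
// sketch of that correction step (illustrative only):
#include <cstdint>
inline std::int32_t correct_quantized_acc(std::int32_t raw_acc, std::int32_t in_sum, std::int32_t we_sum,
                                          std::int32_t zx, std::int32_t zw, std::int32_t taps)
{
    return raw_acc - zw * in_sum - zx * we_sum + taps * zx * zw;
}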
= 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; + + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) += input_val * weights_val; + + we_sum.at(m) += weights_val; + } + + offs += dilation.x() * run_info.input_stride_y; + in_sum += input_val; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for(size_t m = 0; m < depth_multiplier; ++m) + { + acc.at(m) -= in_sum * weights_qoffset; + acc.at(m) -= we_sum.at(m) * input_qoffset; + acc.at(m) += k_offset; + + if(has_biases) + { + acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + } + + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); + if(out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; + } + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) +{ + constexpr int half_vec = vector_size / 2; + + using AccType = int32_t; + using AccVectorType = typename wrapper::traits::neon_vector::type; + using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(input->info()->quantization_info().uniform().offset), TagType{}))); + const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); + const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); + + const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); + const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); + const auto zero = wrapper::vdup_n(static_cast(0), AccVectorTagType{}); + + const auto out_mul = output_multiplier.at(0); + const auto out_shift = output_shift.at(0); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + 
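// (out_mul, out_shift) encode the real-valued rescale input_scale * weight_scale
// / output_scale as out_mul / 2^31 times 2^-out_shift; a negative shift is folded
// in before the doubling-high multiply, a positive one applied as a rounding
// right shift after it. A double-precision reference of what both branches
// compute (not bit-exact: last-bit rounding of the fixed-point path differs):
#include <cmath>
#include <cstdint>
inline std::int32_t requantize_ref(std::int32_t acc, std::int32_t out_mul, std::int32_t out_shift, std::int32_t out_offset)
{
    const double scale = (static_cast<double>(out_mul) / (1ll << 31)) * std::pow(2.0, -out_shift);
    return static_cast<std::int32_t>(std::lround(acc * scale)) + out_offset;
}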
win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(input, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(output, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + std::vector acc0(depth_multiplier / vector_size); + std::vector acc1(depth_multiplier / vector_size); + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::fill(begin(acc0), end(acc0), zero); + std::fill(begin(acc1), end(acc1), zero); + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + const int32_t current_h = input_z + h * dilation.y(); + if(current_h >= 0 && current_h < static_cast(run_info.input_height)) + { + int offs = input_offset; + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const int32_t current_w = input_y + w * dilation.x(); + if(current_w >= 0 && current_w < static_cast(run_info.input_width)) + { + const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); + const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); + const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); + + for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); + const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); + + acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); + acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); + } + } + + offs += dilation.x() * run_info.input_stride_y; + } + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + if(has_biases) + { + const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); + + acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); + acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); + } + + if(out_shift < 0) + { + acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); + acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); + } + else + { + acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); + acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); + } + + acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), 
lower), upper); + acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); + + const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), + wrapper::vmovn(acc1.at(i))); + + if(std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); + } + else + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > input->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > input->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * info.depth_multiplier) != weights->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } + + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); + + if(is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + } + + if(output->total_size() != 0) + { + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} +} // namespace + +CpuDepthwiseConvolutionNativeKernel::CpuDepthwiseConvolutionNativeKernel() + : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases() +{ +} + +void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, weights, (biases != nullptr) ? 
biases : nullptr, output, info));
+
+    _conv_info        = info.pad_stride_info;
+    _depth_multiplier = info.depth_multiplier;
+    _dilation         = info.dilation;
+    _has_biases       = (biases != nullptr);
+
+    if(is_data_type_quantized(input->data_type()))
+    {
+        const auto input_scale  = input->quantization_info().uniform().scale;
+        const auto output_scale = output->quantization_info().uniform().scale;
+
+        auto weights_scale = weights->quantization_info().scale();
+        if(!is_data_type_quantized_per_channel(weights->data_type()))
+        {
+            for(size_t i = 1; i < weights->dimension(channel_idx); ++i)
+            {
+                weights_scale.push_back(weights_scale.front());
+            }
+        }
+
+        for(const auto &s : weights_scale)
+        {
+            int32_t     out_mult   = 0;
+            int32_t     out_shift  = 0;
+            const float multiplier = input_scale * s / output_scale;
+            arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
+
+            _output_multiplier.push_back(out_mult);
+            _output_shift.push_back(out_shift);
+        }
+    }
+
+    switch(weights->data_type())
+    {
+        case DataType::QASYMM8:
+            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, uint8_t>;
+            break;
+        case DataType::QASYMM8_SIGNED:
+            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+            break;
+        case DataType::QSYMM8_PER_CHANNEL:
+            if(input->data_type() == DataType::QASYMM8)
+            {
+                _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, int8_t>;
+            }
+            else
+            {
+                _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+            }
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float16_t, float16_t>;
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float, float>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+    }
+
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
+
+    Window win = calculate_max_window(*output, Steps());
+    ICpuKernel::configure(win);
+}
+
+Status CpuDepthwiseConvolutionNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, info));
+    return Status{};
+}
+
+template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::FloatEnalber<T>>
+void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+                                                        ITensor *dst, const Window &window, bool has_biases)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+    if(_depth_multiplier == 1)
+    {
+        depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases);
+    }
+    else
+    {
+        depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases);
+    }
+}
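// How a caller drives the ported, ITensorInfo-based entry points; the shapes and
// ConvolutionInfo values below are made up for illustration (the braced init
// mirrors the ConvolutionInfo{ pad_stride, depth_multiplier, act, dilation }
// pattern used throughout this patch):
//
//     TensorInfo src(TensorShape(32U, 16U, 16U), 1, DataType::F32);
//     src.set_data_layout(DataLayout::NHWC);
//     TensorInfo weights(TensorShape(32U, 3U, 3U), 1, DataType::F32);
//     weights.set_data_layout(DataLayout::NHWC);
//     TensorInfo dst; // empty: configure() auto-initialises it
//     const ConvolutionInfo conv{ PadStrideInfo(1, 1, 1, 1), 1U, ActivationLayerInfo(), Size2D(1U, 1U) };
//     const Status s = CpuDepthwiseConvolutionNativeKernel::validate(&src, &weights, nullptr, &dst, conv);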
+
+template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::Quantized8bitEnalber<T>>
+void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+                                                        ITensor *dst, const Window &window, bool has_biases)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+    if(_depth_multiplier == 1)
+    {
+        depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
+    }
+    else
+    {
+        const bool is_pow2                 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
+        const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
+
+        if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
+        {
+            depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
+        }
+        else
+        {
+            depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
+        }
+    }
+}
+
+void CpuDepthwiseConvolutionNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    const auto src     = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    const auto biases  = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    auto       dst     = tensors.get_tensor(TensorType::ACL_DST);
+    (this->*_func)(src, weights, biases, dst, window, _has_biases);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
new file mode 100644
index 0000000000..242536d441
--- /dev/null
+++ b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
+
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "support/Requires.h"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include <arm_neon.h>
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
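// Typical run-time use of the stateless kernel: configure() keeps only scalar
// state, and the tensors travel per call in an ITensorPack, matching the slots
// read back in run_op(). A sketch, assuming a configured kernel and allocated
// tensors (the exact scheduling hint is an assumption, shown as other ported
// operators use it):
//
//     ITensorPack pack;
//     pack.add_const_tensor(TensorType::ACL_SRC_0, &src);     // input
//     pack.add_const_tensor(TensorType::ACL_SRC_1, &weights); // weights
//     pack.add_const_tensor(TensorType::ACL_SRC_2, &biases);  // may be nullptr
//     pack.add_tensor(TensorType::ACL_DST, &dst);
//     NEScheduler::get().schedule_op(&kernel, Window::DimY, kernel.window(), pack);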
+/** Interface for the kernel to run a depthwise convolution native on a tensor.
+ */
+class CpuDepthwiseConvolutionNativeKernel : public ICpuKernel
+{
+public:
+    const char *name() const override
+    {
+        return "CpuDepthwiseConvolutionNativeKernel";
+    }
+    /** Default constructor */
+    CpuDepthwiseConvolutionNativeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConvolutionNativeKernel);
+
+    /** Initialize the function's source, destination and parameters.
+     *
+     * @note Supported data layouts: NHWC
+     *
+     * @param[in]  input   Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
+     *                     Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  biases  Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                     Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output  Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  info    Depthwise convolution meta-data.
+     *
+     */
+    void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionNativeKernel
+     *
+     * @note Supported data layouts: NHWC
+     *
+     * @param[in] input   Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
+     *                    Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases  Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                    Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output  Destination tensor info. Data type supported: Same as @p input.
+     * @param[in] info    Depthwise convolution meta-data.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    template <typename T>
+    using FloatEnalber = typename std::enable_if<std::is_floating_point<T>::value, int>::type;
+
+    template <typename T, typename TW, FloatEnalber<T> = 0>
+    void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+
+    template <typename T>
+    using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type;
+
+    template <typename T, typename TW, Quantized8bitEnalber<T> = 0>
+    void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+
+    /** Common signature for all the specialised depthwise convolution native functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+ */ + using DepthwiseFunctionPtr = void (CpuDepthwiseConvolutionNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); + + DepthwiseFunctionPtr _func; + PadStrideInfo _conv_info; + unsigned int _depth_multiplier; + Size2D _dilation; + std::vector _output_multiplier; + std::vector _output_shift; + bool _has_biases; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H */ diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index f7517a50a3..8e3d010786 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -82,9 +82,10 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh if(needs_permute) { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + TensorShape permuted_input_shape = input->tensor_shape(); + TensorShape permuted_weights_shape = weights->tensor_shape(); + const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; + TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); permute(permuted_input_shape, PermutationVector(1U, 2U, 0U)); permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U)); @@ -272,9 +273,10 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate if(needs_permute) { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + TensorShape permuted_input_shape = input->tensor_shape(); + TensorShape permuted_weights_shape = weights->tensor_shape(); + const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; + TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index d17f6b5cd9..e1ceb0f083 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,54 +27,39 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" +#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h" using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; namespace arm_compute { -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; + +struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); - - if(biases != nullptr) + ITensor *src{ nullptr }; // SRC_0 + ITensor *dst{ nullptr }; // DST_0 + const ITensor *weights { - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); - - //Validate Activation Layer - if(act_info.enabled()) + nullptr + }; // SRC_1 + const ITensor *biases { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); - } - return Status{}; -} -} // namespace - -NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; + nullptr + }; // SRC_2 + Tensor permuted_input{}; // INT_0 + Tensor permuted_weights{}; // INT_1 + Tensor permuted_output{}; // INT_2 + Tensor workspace{}; // INT_3 + Tensor packed_weights{}; // INT_4 + std::shared_ptr op{ nullptr }; + bool is_prepared{ false }; + bool permute{ false }; +}; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _dwc_optimized_func(memory_manager), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(), - _permuted_weights(), _permuted_output(), 
+    : _memory_group(memory_manager), _impl(std::make_unique<Impl>())
 {
 }
 
@@ -87,65 +72,76 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
                                                                                  const Size2D               &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
-                                                                                      output->info(), conv_info, depth_multiplier, act_info, dilation));
-
-    _original_weights           = weights;
-    _is_quantized               = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias                   = biases != nullptr;
-    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
-    _permute                    = _is_nchw;
-    _is_prepared                = false;
-    _is_activationlayer_enabled = act_info.enabled();
+
+    bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+    _impl->src     = input;
+    _impl->weights = weights;
+    _impl->biases  = biases;
+    _impl->dst     = output;
+    _impl->permute = is_nchw;
+
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
+                         _impl->dst->info(), info);
 
     // Configure pipeline
-    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
-    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
-    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
-    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
-    if(!_is_activationlayer_enabled)
+    ActivationLayerInfo act_info_to_use            = ActivationLayerInfo();
+    const bool          is_relu                    = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool          is_relu6                   = arm_compute::utils::info_helpers::is_relu6(act_info);
+    bool                is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
+
+    if(!is_activationlayer_enabled)
     {
         act_info_to_use = act_info;
     }
+    info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
 
-    if(_is_nchw)
+    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConvolutionAssemblyDispatch>();
+
+    if(is_nchw)
     {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
+        auto permute_input   = std::make_unique<cpu::CpuPermute>();
+        auto permute_weights = std::make_unique<cpu::CpuPermute>();
+        auto permute_output  = std::make_unique<cpu::CpuPermute>();
+
+        _memory_group.manage(&_impl->permuted_input);
+        _memory_group.manage(&_impl->permuted_weights);
+        _memory_group.manage(&_impl->permuted_output);
 
         // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
 
         // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
 
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
 
         // Configure optimized depthwise
-        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
+        dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);
 
         // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
 
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
+        _impl->permuted_input.allocator()->allocate();
+        _impl->permuted_output.allocator()->allocate();
     }
     else
     {
-        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
+        dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
     }
 
-    // Configure activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
+    // Allocate memory based on the internal memory requirements
+    experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
+    _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size }, 1, DataType::S8), mem_req[0].alignment);
+    _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size }, 1, DataType::S8), mem_req[1].alignment);
+
+    _impl->workspace.allocator()->allocate();
+    _impl->packed_weights.allocator()->allocate();
 }
 
 Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
@@ -157,63 +153,66 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
                                                                                            const ActivationLayerInfo &act_info,
                                                                                            const Size2D              &dilation)
 {
-    return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
 {
     prepare();
-
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    // Permute input
-    if(_permute)
-    {
-        _permute_input.run();
-    }
-
-    // Run assembly function
-    _dwc_optimized_func.run();
-
-    // Permute output
-    if(_is_nchw)
-    {
-        _permute_output.run();
-    }
-
-    // Run activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
+    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
+    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
+    pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace);
+    pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights);
+    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+
+    _impl->op->run(pack);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
 {
-    if(!_is_prepared)
+    if(!_impl->is_prepared)
     {
         // Permute weights
-        if(_permute)
+        if(_impl->permute)
         {
-            _permuted_weights.allocator()->allocate();
-            _permute_weights.run();
-            _original_weights->mark_as_unused();
+            _impl->permuted_weights.allocator()->allocate();
+            _impl->weights->mark_as_unused();
        }
 
-        // Prepare optimized function
-        _dwc_optimized_func.prepare();
-
-        if(!_permuted_weights.is_used())
+        if(!_impl->permuted_weights.is_used())
         {
-            _permuted_weights.allocator()->free();
+            _impl->permuted_weights.allocator()->free();
         }
 
-        _is_prepared = true;
+        _impl->is_prepared = true;
     }
 }
 
+struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
+{
+    Tensor                                        permuted_input{};
+    Tensor                                        permuted_weights{};
+    Tensor                                        permuted_output{};
+    bool                                          is_prepared{ false };
+    bool                                          is_nchw{ false };
+    bool                                          is_activationlayer_enabled{ false };
+    const ITensor                                *weights{ nullptr };
+    const ITensor                                *biases{ nullptr };
+    const ITensor                                *src{ nullptr };
+    ITensor                                      *dst{ nullptr };
+    std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+};
+
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
-    : _depthwise_conv_kernel(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
-      _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+    : _impl(std::make_unique<Impl>())
 {
 }
 
@@ -224,45 +223,49 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
     ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
                                                                      output->info(), conv_info, depth_multiplier, act_info, dilation));
 
-    _is_nchw     = input->info()->data_layout() == DataLayout::NCHW;
-    _is_prepared = !_is_nchw;
+    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+    _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
+
+    _impl->src         = input;
+    _impl->dst         = output;
+    _impl->weights     = weights;
+    _impl->biases      = biases;
+    _impl->is_nchw     = input->info()->data_layout() == DataLayout::NCHW;
+    _impl->is_prepared = !_impl->is_nchw;
 
     ITensor       *input_to_use   = input;
     const ITensor *weights_to_use = weights;
     ITensor       *output_to_use  = output;
 
-    if(_is_nchw)
+    if(_impl->is_nchw)
     {
-        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
-        input_to_use = &_permuted_input;
+        auto permute_input   = std::make_unique<cpu::CpuPermute>();
+        auto permute_weights = std::make_unique<cpu::CpuPermute>();
 
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
-        weights_to_use = &_permuted_weights;
+        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        input_to_use = &_impl->permuted_input;
 
-        _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
-        output_to_use = &_permuted_output;
+        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        weights_to_use = &_impl->permuted_weights;
+
+        _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+        output_to_use = &_impl->permuted_output;
     }
 
-    _original_weights      = weights_to_use;
-    _depthwise_conv_kernel = std::make_unique<NEDepthwiseConvolutionLayerNativeKernel>();
-    _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
+    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>();
+    depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
 
-    if(_is_nchw)
+    if(_impl->is_nchw)
    {
-        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        auto permute_output = std::make_unique<cpu::CpuPermute>();
+        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
+        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
 
-        _permuted_input.allocator()->allocate();
-        _permuted_weights.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
-    }
-
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
+        _impl->permuted_input.allocator()->allocate();
+        _impl->permuted_weights.allocator()->allocate();
+        _impl->permuted_output.allocator()->allocate();
     }
 }
 
@@ -270,89 +273,53 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate
                                                              const PadStrideInfo &conv_info,
                                                              unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    if(input->data_layout() == DataLayout::NCHW)
-    {
-        TensorShape permuted_input_shape   = input->tensor_shape();
-        TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
-        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
-        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
-
-        const TensorInfo permuted_input   = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, dilation));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, dilation));
-    }
-
-    // Validate Activation Layer
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
-    }
-
-    return Status{};
+    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
 {
-    if(_is_nchw)
-    {
-        prepare();
-        _permute_input.run();
-    }
-
-    NEScheduler::get().schedule(_depthwise_conv_kernel.get(), Window::DimY);
-
-    if(_is_nchw)
-    {
-        _permute_output.run();
-    }
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
+    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
+    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
+    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+
+    _impl->op->run(pack);
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::prepare()
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
 {
-    if(!_is_prepared)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _permute_weights.run();
-        _original_weights->mark_as_unused();
-        _is_prepared = true;
-    }
 }
 
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(std::move(memory_manager)), _func_generic()
+#ifndef DOXYGEN_SKIP_THIS
+struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
 {
-}
+    DepthwiseConvolutionFunction                 depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
+    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+    NEDepthwiseConvolutionLayerGeneric           func_generic{};
+    std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+};
+#endif // DOXYGEN_SKIP_THIS
 
 void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                                             const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
-    _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation);
-    switch(_depth_conv_func)
+    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    _impl->op = std::make_shared<cpu::CpuDepthwiseConvolution>();
+    _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
+                                                                          info);
+    switch(_impl->depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
            break;
        case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
@@ -362,43 +329,19 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
 Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                              unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
-    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    switch(depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            return NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-            return NEDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
-    }
-}
-
-DepthwiseConvolutionFunction NEDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                            const PadStrideInfo &conv_info,
-                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    if(bool(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)))
-    {
-        return DepthwiseConvolutionFunction::OPTIMIZED;
-    }
-    else
-    {
-        return DepthwiseConvolutionFunction::GENERIC;
-    }
+    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::run()
 {
-    switch(_depth_conv_func)
+    switch(_impl->depth_conv_func)
    {
        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_optimized.run();
+            _impl->func_optimized.run();
            break;
        case
DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(); + _impl->func_generic.run(); break; default: ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); @@ -407,13 +350,13 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_depth_conv_func) + switch(_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(); + _impl->func_optimized.prepare(); break; case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(); + _impl->func_generic.prepare(); break; default: ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp deleted file mode 100644 index 101df98b7d..0000000000 --- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp +++ /dev/null @@ -1,569 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
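With the port in place the public NEON entry point is untouched, so existing call sites keep compiling; only the internals now delegate to the stateless cpu operator. A minimal usage sketch of that unchanged interface follows (shapes, names and values are illustrative only and not part of the patch; a 3x3 kernel with stride 1 and depth_multiplier 1 is a configuration the assembly path accepts):

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void example_depthwise()
    {
        Tensor src{}, weights{}, biases{}, dst{};

        // NHWC tensors (shape order is C, W, H, N): 1 batch, 32x32 spatial, 16 channels
        TensorInfo src_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
        TensorInfo weights_info(TensorShape(16U, 3U, 3U), 1, DataType::F32); // 3x3 depthwise kernel
        TensorInfo dst_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        weights_info.set_data_layout(DataLayout::NHWC);
        dst_info.set_data_layout(DataLayout::NHWC);
        src.allocator()->init(src_info);
        weights.allocator()->init(weights_info);
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(dst_info);

        // SAME padding, stride 1, depth_multiplier 1
        NEDepthwiseConvolutionLayer dwc{};
        dwc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), 1);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        dwc.run(); // the first run() triggers prepare(): weight permutation/packing
    }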
- */ - -#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" -#include "src/core/helpers/AutoConfiguration.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include - -namespace arm_compute -{ -namespace -{ -std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, 
in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr create_convolver(const ITensor *input, - const ITensor *weights, - ITensor *output, - PadStrideInfo conv_info, - ActivationLayerInfo act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_UNUSED(dilation); - const DataType data_type = input->info()->data_type(); - const TensorShape shape = input->info()->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = dilation.x(); - const int padding_top = conv_info.pad_top(); - const int padding_left = conv_info.pad_left(); - const int padding_bottom = conv_info.pad_bottom(); - const int padding_right = conv_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == 
DataType::QASYMM8) && (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = conv_info.stride().first; - const unsigned int kernel_size = weights->info()->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->info()->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->info()->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector fmultipliers; - std::vector qmultipliers; - std::vector qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } - - qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); - - return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, 
in_rows, in_cols, n_channels, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else - { - // Create float convolver - switch(data_type) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } - default: - return nullptr; - } - } -} -} // namespace - -struct NEDepthwiseConvolutionAssemblyDispatch::LocalImpl -{ - std::unique_ptr _dwc_assembly_kernel{ nullptr }; - NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel{}; -}; - -#ifndef DOXYGEN_SKIP_THIS -NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), - _pImpl(std::make_unique()) -{ -} -#endif /* DOXYGEN_SKIP_THIS */ - -NEDepthwiseConvolutionAssemblyDispatch::~NEDepthwiseConvolutionAssemblyDispatch() = default; - -void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input, - const ITensor *weights, - const ITensor *bias, - ITensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(depth_multiplier); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(), - weights->info(), - bias != nullptr ? 
bias->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); - - // Output auto inizialitation if not yet initialized - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation); - auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); - - _input = input; - _weights = weights; - _bias = bias; - _output = output; - _is_prepared = false; - - // Create convolver - _pImpl->_dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info, dilation); - ARM_COMPUTE_ERROR_ON(_pImpl->_dwc_assembly_kernel == nullptr); - - // Create assembly kernel wrapper - _pImpl->_dwc_acl_kernel.configure(_pImpl->_dwc_assembly_kernel.get()); - - constexpr size_t alignment = 128; - - // Create workspace - const unsigned int num_threads = NEScheduler::get().num_threads(); - const size_t workspace_size = _pImpl->_dwc_assembly_kernel->get_working_space_size(num_threads); - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); - _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment); - _memory_group.manage(&_workspace); - _workspace.allocator()->allocate(); - - // Create packing tensor - const size_t pack_tensor_size = _pImpl->_dwc_assembly_kernel->get_packed_params_size(); - ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); - _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment); -} - -Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - // The uniform quantization case will only have 1 
scale value in the weights quantization info - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = input_qinfo.scale * s / output_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; -} - -bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input, - const ITensorInfo *weights, - PadStrideInfo conv_info, - unsigned int depth_multiplier, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = input->data_layout(); - TensorShape in_shape{ input->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, input->tensor_shape().y()); - in_shape.set(Window::DimY, input->tensor_shape().z()); - in_shape.set(Window::DimZ, input->tensor_shape().x()); - } - - // Check data type - // TODO (COMPMID-3004): Add assembly optimized routine for QASYMM8_SIGNED NEDepthwiseConvolutionLayer - const DataType input_type = input->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weighs size - std::set supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = conv_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = conv_info.pad_top(); - const auto pad_right = conv_info.pad_right(); - const auto pad_bottom = conv_info.pad_bottom(); - const auto pad_left = conv_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), conv_info, DataLayout::NCHW, dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && 
is_dilation_supported; -} - -void NEDepthwiseConvolutionAssemblyDispatch::run() -{ - // Prepare assembly kernel - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr); - _pImpl->_dwc_assembly_kernel->set_working_space(static_cast(_workspace.buffer())); - - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - const int input_element_size = _input->info()->element_size(); - const int input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size; - const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes(); - _pImpl->_dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); - const int output_element_size = _output->info()->element_size(); - const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes(); - _pImpl->_dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); - - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->_dwc_acl_kernel, Window::DimX); -} - -void NEDepthwiseConvolutionAssemblyDispatch::prepare() -{ - if(!_is_prepared) - { - _packed_weights.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr); - - // Pack weights and bias - const int weights_element_size = _weights->info()->element_size(); - const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size; - const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->_dwc_assembly_kernel->pack_params(_packed_weights.buffer(), - _weights->buffer() + _weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (_bias != nullptr) ? _bias->buffer() : nullptr); - _pImpl->_dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer()); - - _weights->mark_as_unused(); - if(_bias != nullptr) - { - _bias->mark_as_unused(); - } - _is_prepared = true; - } -} -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp new file mode 100644 index 0000000000..183a2af0cd --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2021 Arm Limited. 
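The dispatcher deleted above owned its tensors and kernel state; the operator introduced below is stateless instead: configure() and validate() see only ITensorInfo descriptors, and the actual buffers arrive at prepare()/run() through an ITensorPack. A rough sketch of that contract (illustrative only; src, weights, biases and dst are assumed to be pre-initialised, allocated tensors matching the infos passed to configure(), and the OPTIMIZED path additionally expects the ACL_INT_0..ACL_INT_4 working tensors shown in the wrapper's run() earlier):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h"

    using namespace arm_compute;

    void run_depthwise(cpu::CpuDepthwiseConvolution &op, ITensor &src, ITensor &weights,
                       ITensor &biases, ITensor &dst, const ConvolutionInfo &info)
    {
        // Configuration works purely on metadata
        op.configure(src.info(), weights.info(), biases.info(), dst.info(), info);

        // Execution receives the concrete buffers through a pack
        ITensorPack pack{};
        pack.add_tensor(TensorType::ACL_SRC_0, &src);     // input
        pack.add_tensor(TensorType::ACL_SRC_1, &weights); // weights
        pack.add_tensor(TensorType::ACL_SRC_2, &biases);  // bias
        pack.add_tensor(TensorType::ACL_DST_0, &dst);     // output
        op.prepare(pack); // one-off weight permutation/packing
        op.run(pack);     // repeatable per-frame execution
    }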
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/InfoHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + if(!is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); + const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > input->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > input->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if(biases != nullptr) + { + const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); + } + + ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, info)); + + //Validate Activation Layer + if(info.act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); + } + return Status{}; +} +} // namespace + 
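The two dimension guards in validate_arguments_optimized() encode the effective extent of a dilated kernel, which must fit inside the padded input. A worked instance with illustrative values:

    // K_eff = K + (K - 1) * (d - 1): a 3x3 kernel at dilation 2 covers 5 input
    // elements per axis, so e.g. a 4-wide input needs pad_left + pad_right >= 1.
    constexpr unsigned int K     = 3;
    constexpr unsigned int d     = 2;
    constexpr unsigned int K_eff = K + (K - 1) * (d - 1);
    static_assert(K_eff == 5, "dilated 3x3 kernel spans 5 input elements");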
+CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::CpuDepthwiseConvolutionOptimizedInternal()
+    : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+{
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configure(ITensorInfo           *input,
+                                                                                  const ITensorInfo     *weights,
+                                                                                  const ITensorInfo     *biases,
+                                                                                  ITensorInfo           *output,
+                                                                                  const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, (biases == nullptr) ? nullptr : biases,
+                                                                                  output, info));
+
+    _is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    _has_bias     = biases != nullptr;
+    _is_nchw      = input->data_layout() == DataLayout::NCHW;
+    _permute      = _is_nchw;
+    _is_prepared  = false;
+
+    // Configure pipeline
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(info.act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(info.act_info);
+    _is_activationlayer_enabled         = info.act_info.enabled() && !(is_relu || is_relu6);
+
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = info.act_info;
+    }
+
+    _dwc_optimized_func = std::make_unique<CpuDepthwiseConvolutionAssemblyDispatch>();
+    if(_is_nchw)
+    {
+        _permute_input   = std::make_unique<cpu::CpuPermute>();
+        _permute_weights = std::make_unique<cpu::CpuPermute>();
+        _permute_output  = std::make_unique<cpu::CpuPermute>();
+
+        auto input_perm   = std::make_unique<TensorInfo>();
+        auto weights_perm = std::make_unique<TensorInfo>();
+        auto output_perm  = std::make_unique<TensorInfo>();
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U));
+        input_perm->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+        weights_perm->set_data_layout(DataLayout::NHWC);
+
+        output_perm->set_data_layout(DataLayout::NHWC);
+        output_perm->set_quantization_info(output->quantization_info());
+
+        // Configure optimized depthwise
+        _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        output_perm->set_data_layout(DataLayout::NHWC);
+        _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U));
+    }
+    else
+    {
+        _dwc_optimized_func->configure(input, weights, biases, output, info);
+    }
+
+    // Configure activation
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(output, nullptr, info.act_info);
+    }
+}
+
+Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::validate(const ITensorInfo     *input,
+                                                                                   const ITensorInfo     *weights,
+                                                                                   const ITensorInfo     *biases,
+                                                                                   const ITensorInfo     *output,
+                                                                                   const ConvolutionInfo &info)
+{
+    return validate_arguments_optimized(input, weights, biases, output, info);
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    prepare(tensors);
+
+    auto bias =
tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute input + if(_permute) + { + ITensorPack pack; + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + } + + // Run assembly function + if(_is_nchw) + { + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src_perm); + pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); + pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst_perm); + _dwc_optimized_func->run(pack); + } + else + { + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src); + pack.add_tensor(TensorType::ACL_SRC_1, weights); + pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst); + _dwc_optimized_func->run(pack); + } + + // Permute output + if(_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + // Run activation + if(_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute weights + if(_permute) + { + auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, permuted_weights); + _permute_weights->run(pack); + + ITensorPack pack_opt; + pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + else + { + ITensorPack pack_opt; + pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + + _is_prepared = true; + } +} + +CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::CpuDepthwiseConvolutionGeneric() + : _depthwise_conv_kernel(nullptr), _permute_input(nullptr), _permute_weights(nullptr), 
_permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false),
+      _is_activationlayer_enabled(false)
+{
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolution::validate(input, weights, (biases == nullptr) ? nullptr : biases,
+                                                                 output, info));
+
+    _is_nchw     = input->data_layout() == DataLayout::NCHW;
+    _is_prepared = !_is_nchw;
+
+    ITensorInfo       *input_to_use   = input;
+    const ITensorInfo *weights_to_use = weights;
+    ITensorInfo       *output_to_use  = output;
+
+    auto input_perm   = std::make_unique<TensorInfo>();
+    auto weights_perm = std::make_unique<TensorInfo>();
+    auto output_perm  = std::make_unique<TensorInfo>(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+
+    if(_is_nchw)
+    {
+        _permute_input   = std::make_unique<cpu::CpuPermute>();
+        _permute_weights = std::make_unique<cpu::CpuPermute>();
+
+        _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U));
+        input_perm->set_data_layout(DataLayout::NHWC);
+        input_to_use = input_perm.get();
+
+        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+        weights_perm->set_data_layout(DataLayout::NHWC);
+        weights_to_use = weights_perm.get();
+
+        output_to_use = output_perm.get();
+    }
+
+    _depthwise_conv_kernel = std::make_unique<kernels::CpuDepthwiseConvolutionNativeKernel>();
+    _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
+
+    if(_is_nchw)
+    {
+        _permute_output = std::make_unique<cpu::CpuPermute>();
+        _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U));
+        output_perm->set_data_layout(DataLayout::NHWC);
+    }
+
+    //Configure Activation Layer
+    _is_activationlayer_enabled = info.act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(output, nullptr, info.act_info);
+    }
+}
+
+Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                                                         const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    if(input->data_layout() == DataLayout::NCHW)
+    {
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+        const TensorInfo permuted_input   = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights,
&permuted_weights, PermutationVector(2U, 0U, 1U))); + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); + + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(input, weights, biases, output, info)); + } + + // Validate Activation Layer + if(info.act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); + } + + return Status{}; +} + +void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::run(ITensorPack &tensors) +{ + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + + if(_is_nchw) + { + prepare(tensors); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + + ITensorPack pack_depth; + pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); + pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + } + else + { + ITensorPack pack_depth; + pack_depth.add_tensor(TensorType::ACL_SRC_0, src); + pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + } + + if(_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + if(_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + + ARM_COMPUTE_ERROR_ON(!weights->is_used()); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, weights_perm); + + _permute_weights->run(pack); + weights->mark_as_unused(); + _is_prepared = true; + } +} + +CpuDepthwiseConvolution::CpuDepthwiseConvolution() + : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic() +{ +} + +void CpuDepthwiseConvolution::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) +{ + _depth_conv_func = get_depthwiseconvolution_function(input, weights, (biases != nullptr) ? 
biases : nullptr, output, info); + switch(_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.configure(input, weights, biases, output, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.configure(input, weights, biases, output, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +Status CpuDepthwiseConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) +{ + DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, info); + switch(depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + return CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + return CpuDepthwiseConvolutionGeneric::validate(input, weights, biases, output, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +DepthwiseConvolutionFunction CpuDepthwiseConvolution::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const ConvolutionInfo &info) +{ + if(bool(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info))) + { + return DepthwiseConvolutionFunction::OPTIMIZED; + } + else + { + return DepthwiseConvolutionFunction::GENERIC; + } +} + +void CpuDepthwiseConvolution::run(ITensorPack &tensors) +{ + switch(_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.run(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.run(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} + +void CpuDepthwiseConvolution::prepare(ITensorPack &tensors) +{ + switch(_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.prepare(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.prepare(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h new file mode 100644 index 0000000000..e39cb7db4d --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTION_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTION_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConvolution : public ICpuOperator
+{
+public:
+    /** Default constructor */
+    CpuDepthwiseConvolution();
+    /** Initialize the function's source, destination, weights and convolution information.
+     *
+     * @param[in, out] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out]     output  Destination tensor info. Data type supported: same as @p input.
+     * @param[in]      weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+     *                         Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                         Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]      info    Depthwise convolution meta-data.
+     */
+    void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution
+     *
+     * @param[in] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] output  Destination tensor info. Data type supported: same as @p input.
+     * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+     *                    Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                    Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] info    Depthwise convolution meta-data.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+    /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConvolution
+     *
+     * @param[in] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+     *                    Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                    Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output  Destination tensor info. Data type supported: same as @p input.
+     * @param[in] info    Depthwise convolution meta-data.
+     *
+     * @return a Depthwise Convolution Function
+     */
+    static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                                                          const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+
+private:
+    /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+     *
+     * @note At the moment only 3x3 and 5x5 kernels with stride 1 or 2 are supported
+     *
+     * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
+     * -# @ref CpuDepthwiseConvolution3x3Kernel if 3x3 and no assembly kernel implementation is present
+     * -# @ref CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+     * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
+     * -# @ref CpuActivation if fused activation is required
+     *
+     */
+    class CpuDepthwiseConvolutionOptimizedInternal : public ICpuOperator
+    {
+    public:
+        /** Default constructor */
+        CpuDepthwiseConvolutionOptimizedInternal();
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConvolutionOptimizedInternal(const CpuDepthwiseConvolutionOptimizedInternal &) = delete;
+        /** Default move constructor */
+        CpuDepthwiseConvolutionOptimizedInternal(CpuDepthwiseConvolutionOptimizedInternal &&) = default;
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConvolutionOptimizedInternal &operator=(const CpuDepthwiseConvolutionOptimizedInternal &) = delete;
+        /** Default move assignment operator */
+        CpuDepthwiseConvolutionOptimizedInternal &operator=(CpuDepthwiseConvolutionOptimizedInternal &&) = default;
+        /** Default destructor */
+        ~CpuDepthwiseConvolutionOptimizedInternal() = default;
+        /** Initialize the function's source, destination, kernels and border_size.
+         *
+         * @param[in, out] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+         * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                         Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+         * @param[out]     output  Destination tensor info. Data type supported: same as @p input.
+         * @param[in]      info    Depthwise convolution meta-data.
+         */
+        void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+
+        /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionOptimizedInternal
+         *
+         * @param[in] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+         * @param[in] biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                    Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+         * @param[in] output  Destination tensor info. Data type supported: same as @p input.
+         * @param[in] info    Depthwise convolution meta-data.
+         *
+         * @return a status
+         */
+        static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+        // Inherited methods overridden:
+        void run(ITensorPack &tensors) override;
+        void prepare(ITensorPack &tensors) override;
+
+    private:
+        std::unique_ptr<CpuDepthwiseConvolutionAssemblyDispatch> _dwc_optimized_func{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_input{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_weights{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_output{ nullptr };
+        std::unique_ptr<CpuActivation>                           _activationlayer_function{ nullptr };
+        bool _has_bias{ false };
+        bool _is_quantized{ false };
+        bool _is_nchw{ true };
+        bool _permute{ false };
+        bool _is_activationlayer_enabled{ false };
+        bool _is_prepared{ false };
+    };
+
+    /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
+     *
+     * -# @ref CpuDepthwiseConvolutionNativeKernel
+     *
+     */
+    class CpuDepthwiseConvolutionGeneric : public ICpuOperator
+    {
+    public:
+        /** Default constructor */
+        CpuDepthwiseConvolutionGeneric();
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConvolutionGeneric(const CpuDepthwiseConvolutionGeneric &) = delete;
+        /** Default move constructor */
+        CpuDepthwiseConvolutionGeneric(CpuDepthwiseConvolutionGeneric &&) = default;
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConvolutionGeneric &operator=(const CpuDepthwiseConvolutionGeneric &) = delete;
+        /** Default move assignment operator */
+        CpuDepthwiseConvolutionGeneric &operator=(CpuDepthwiseConvolutionGeneric &&) = default;
+        /** Default destructor */
+        ~CpuDepthwiseConvolutionGeneric() = default;
+        /** Initialize the function's source, destination, weights and convolution information.
+         *
+         * @param[in, out] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[out]     output  Destination tensor info. Data type supported: same as @p input.
+         * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+         *                         Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+         * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                         Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+         * @param[in]      info    Depthwise convolution meta-data.
+         */
+        void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+
+        /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionGeneric
+         *
+         * @param[in] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[in] output  Destination tensor info. Data type supported: same as @p input.
+         * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+         *                    Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+         * @param[in] biases  Biases tensor info. 
A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                    Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+         * @param[in] info    Depthwise convolution meta-data.
+         *
+         * @return a status
+         */
+        static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+        // Inherited methods overridden:
+        void run(ITensorPack &tensors) override;
+        void prepare(ITensorPack &tensors) override;
+
+    private:
+        std::unique_ptr<kernels::CpuDepthwiseConvolutionNativeKernel> _depthwise_conv_kernel{ nullptr };
+        std::unique_ptr<CpuPermute>                                   _permute_input{ nullptr };
+        std::unique_ptr<CpuPermute>                                   _permute_weights{ nullptr };
+        std::unique_ptr<CpuPermute>                                   _permute_output{ nullptr };
+        std::unique_ptr<CpuActivation>                                _activationlayer_function{ nullptr };
+        bool _is_nchw{ true };
+        bool _is_prepared{ false };
+        bool _is_activationlayer_enabled{ false };
+    };
+
+    DepthwiseConvolutionFunction             _depth_conv_func;
+    CpuDepthwiseConvolutionOptimizedInternal _func_optimized;
+    CpuDepthwiseConvolutionGeneric           _func_generic;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTION_H */
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp
new file mode 100644
index 0000000000..5f5304cded
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp
@@ -0,0 +1,564 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
+#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
+#include "src/core/helpers/AutoConfiguration.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x,
+                                                                        int n_batches, int in_rows, int in_cols, int n_channels,
+                                                                        int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
+                                                                        const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
+                                                                        const qasymm8::QAsymm8RescaleParams &rescale_params,
+                                                                        int padding_top, int padding_left, int padding_bottom, int padding_right)
+{
+    switch(kernel_size)
+    {
+        case 3:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        case 5:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        default:
+            return nullptr;
+    }
+}
+
+std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x,
+                                                                                  int n_batches, int in_rows, int in_cols, int n_channels,
+                                                                                  neon_convolution_kernels::ActivationFunction activation,
+                                                                                  const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
+                                                                                  const qsymm8::QSymm8PerChannelRescaleParams &rescale_params,
+                                                                                  int padding_top, int padding_left, int padding_bottom, int padding_right)
+{
+    switch(kernel_size)
+    {
+        case 3:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+                               n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+                               n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        case 5:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
+                               n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
+                               n_batches, in_rows, 
in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params,
+                               padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        default:
+            return nullptr;
+    }
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x,
+                                                                     int n_batches, int in_rows, int in_cols, int n_channels,
+                                                                     int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
+                                                                     int padding_top, int padding_left, int padding_bottom, int padding_right)
+{
+    switch(kernel_size)
+    {
+        case 3:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        case 5:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        default:
+            return nullptr;
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x,
+                                                                     int n_batches, int in_rows, int in_cols, int n_channels,
+                                                                     int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
+                                                                     int padding_top, int padding_left, int padding_bottom, int padding_right)
+{
+    switch(kernel_size)
+    {
+        case 3:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        case 5:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                case 2:
+                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+                default:
+                    return nullptr;
+            }
+        }
+        default:
+            return nullptr;
+    }
+}
+
+std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo     *input,
+                                                                   const ITensorInfo     *weights,
+                                                                   ITensorInfo           *output,
+                                                                   const ConvolutionInfo &info)
+{
+    const DataType    data_type = input->data_type();
+    const TensorShape shape     = input->tensor_shape();
+
+    const int n_batches       = shape[3];
+    const int in_rows         = shape.z();
+    const int in_cols         = shape.y();
+    const int n_channels      = shape.x();
+    const int dilation_factor = info.dilation.x();
+    const int padding_top     = info.pad_stride_info.pad_top();
+    const int padding_left    = info.pad_stride_info.pad_left();
+    const int padding_bottom  = info.pad_stride_info.pad_bottom();
+    const int padding_right   = info.pad_stride_info.pad_right();
+
+    const bool is_uniform_quantized    = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8);
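+    // NOTE: the convolver template arguments above are assumed to follow the depthwise convolver
+    // convention <OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols[, TIn, TBias, TOut]>,
+    // as in the pre-port NEDepthwiseConvolutionAssemblyDispatch. Two quantized paths are handled:
+    // uniform (QASYMM8 weights, one scale for the whole tensor) and hybrid per-channel
+    // (QSYMM8_PER_CHANNEL weights, one scale per output channel).
+    const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == 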
DataType::QSYMM8_PER_CHANNEL);
+
+    const unsigned int stride_x    = info.pad_stride_info.stride().first;
+    const unsigned int kernel_size = weights->tensor_shape().y();
+
+    // Map activation function
+    neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
+    if(arm_compute::utils::info_helpers::is_relu(info.act_info))
+    {
+        activation = neon_convolution_kernels::ActivationFunction::ReLU;
+    }
+    else if(arm_compute::utils::info_helpers::is_relu6(info.act_info))
+    {
+        activation = neon_convolution_kernels::ActivationFunction::ReLU6;
+    }
+
+    // Create quantized convolver
+    if(is_uniform_quantized)
+    {
+        const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
+        const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform();
+        const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
+
+        // Check that quantization info are in the range [0, 255]
+        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
+        ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
+        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
+        const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
+        const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
+        const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
+
+        // Calculate rescale parameters
+        const float fmultipler  = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int32_t     qmultiplier = 0;
+        int32_t     qshift      = 0;
+        quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
+        qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);
+
+        return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation,
+                                     wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+    }
+    else if(is_perchannel_quantized)
+    {
+        const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
+        const QuantizationInfo        weights_qinfo = weights->quantization_info();
+        const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
+
+        // Check that quantization info are in the range [0, 255]
+        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
+        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
+        const qasymm8::QAsymm8Params         iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
+        const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() };
+        const qasymm8::QAsymm8Params         oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
+
+        // Calculate rescale parameters
+        std::vector<float>   fmultipliers;
+        std::vector<int32_t> qmultipliers;
+        std::vector<int32_t> qshifts;
+
+        for(auto const s : wqinfo.scales)
+        {
+            const float fmultipler  = iqinfo.scale * s / oqinfo.scale;
+            int32_t     qmultiplier = 0;
+            int32_t     qshift      = 0;
+            quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
+            fmultipliers.push_back(fmultipler);
+            qmultipliers.push_back(qmultiplier);
+            qshifts.push_back(qshift);
+        }
+
+        qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers);
+
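+        // A quick numeric check of the rescale encoding, using illustrative values: with
+        // iqinfo.scale = 0.5, s = 0.25 and oqinfo.scale = 1.0 the effective multiplier is
+        // 0.5 * 0.25 / 1.0 = 0.125 = 0.5 * 2^-2, which calculate_quantized_multiplier_less_than_one()
+        // encodes as qmultiplier = 2^30 (i.e. 0.5 in Q0.31 fixed point) and qshift = 2, so the
+        // integer pipeline computes roughly (acc * qmultiplier) >> (31 + qshift) ~= acc * 0.125.
+        return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation,
+                                               wqinfo, iqinfo, oqinfo, rescale_params, 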
padding_top, padding_left, padding_bottom, padding_right);
+    }
+    else
+    {
+        // Create float convolver
+        switch(data_type)
+        {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F16:
+            {
+                return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+            }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F32:
+            {
+                return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
+            }
+            default:
+                return nullptr;
+        }
+    }
+}
+} // namespace
+
+struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl
+{
+    std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr };
+    NEDepthwiseConvolutionAssemblyKernelWrapper       dwc_acl_kernel{};
+    bool                                              is_prepared{ false };
+    experimental::MemoryRequirements                  mem_req{};
+};
+
+#ifndef DOXYGEN_SKIP_THIS
+CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch()
+    : _pImpl(std::make_unique<LocalImpl>())
+{
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch() = default;
+
+void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo     *input,
+                                                        const ITensorInfo     *weights,
+                                                        const ITensorInfo     *bias,
+                                                        ITensorInfo           *output,
+                                                        const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(bias);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionAssemblyDispatch::validate(input,
+                                                                                 weights,
+                                                                                 bias,
+                                                                                 output,
+                                                                                 info));
+
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
+
+    _pImpl->is_prepared = false;
+
+    // Create convolver
+    _pImpl->dwc_assembly_kernel = create_convolver(input, weights, output, info);
+    ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr);
+
+    // Create assembly kernel wrapper
+    _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get());
+
+    constexpr size_t alignment = 128;
+
+    // Create workspace
+    const unsigned int num_threads    = NEScheduler::get().num_threads();
+    const size_t       workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads);
+    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0!");
+    _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment });
+
+    // Create packing tensor
+    const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size();
+    ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0!");
+
+    _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment });
+}
+
+experimental::MemoryRequirements CpuDepthwiseConvolutionAssemblyDispatch::workspace() const
+{
+    return _pImpl->mem_req;
+}
+
+Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo     *input,
+                                                         const ITensorInfo     *weights,
+                                                         const ITensorInfo     *bias,
+                                                         const ITensorInfo     *output,
+                                                         const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    if(weights->data_type() != 
DataType::QSYMM8_PER_CHANNEL)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    // Validate convolver
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, info));
+
+    // Validate activation
+    const bool is_relu  = arm_compute::utils::info_helpers::is_relu(info.act_info);
+    const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6));
+
+    // Check bias
+    if(bias != nullptr)
+    {
+        unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
+    }
+
+    // Check output
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    // The uniform quantization case will only have 1 scale value in the weights quantization info
+    const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
+    const QuantizationInfo        weights_qinfo = weights->quantization_info();
+    const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
+    for(auto const s : weights_qinfo.scale())
+    {
+        const float fmultipler = input_qinfo.scale * s / output_qinfo.scale;
+        ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
+    }
+
+    return Status{};
+}
+
+bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo     *input,
+                                                                     const ITensorInfo     *weights,
+                                                                     const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+
+    // Reshape input shape if in NHWC format
+    const DataLayout data_layout = input->data_layout();
+    TensorShape      in_shape{ input->tensor_shape() };
+    if(data_layout == DataLayout::NHWC)
+    {
+        in_shape.set(Window::DimX, input->tensor_shape().y());
+        in_shape.set(Window::DimY, input->tensor_shape().z());
+        in_shape.set(Window::DimZ, input->tensor_shape().x());
+    }
+
+    // Check data type
+    // TODO (COMPMID-3004): Add assembly optimized routine for QASYMM8_SIGNED NEDepthwiseConvolutionLayer
+    const DataType input_type            = input->data_type();
+    const bool     is_input_type_valid   = is_data_type_float(input_type) || input_type == DataType::QASYMM8;
+    const DataType weights_type          = weights->data_type();
+    const bool     is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED
+                                           || weights_type == DataType::QSYMM8_PER_CHANNEL;
+
+    // Check weights size
+    std::set<unsigned int> supported_kernel_sizes = { 3, 5 };
+    const unsigned int     width_idx         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int     height_idx        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int     kernel_w          = weights->dimension(width_idx);
+    const unsigned int     kernel_h          = weights->dimension(height_idx);
+    bool                   weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0);
+
+    // Check for supported strides
+    const auto &strides = info.pad_stride_info.stride();
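+    // Taken together, the checks below mean the assembly path is used only for square 3x3 or 5x5
+    // kernels, equal x/y strides of 1 or 2, SAME or VALID padding, depth_multiplier == 1, and unit
+    // dilation (or equal x/y dilation with stride 1; QSYMM8_PER_CHANNEL weights require unit dilation).
+    bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || 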
(strides.first == 2));
+
+    // Check for supported padding
+    const auto    pad_top    = info.pad_stride_info.pad_top();
+    const auto    pad_right  = info.pad_stride_info.pad_right();
+    const auto    pad_bottom = info.pad_stride_info.pad_bottom();
+    const auto    pad_left   = info.pad_stride_info.pad_left();
+    PadStrideInfo same_pad   = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation);
+    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+    bool          supported_padding = is_same_padding || is_valid_padding;
+    // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
+    bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1));
+
+    if(weights_type == DataType::QSYMM8_PER_CHANNEL)
+    {
+        is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U));
+    }
+
+    return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported;
+}
+
+void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors)
+{
+    // Prepare assembly kernel
+    prepare(tensors);
+
+    auto src       = tensors.get_tensor(TensorType::ACL_SRC_0);
+    auto workspace = tensors.get_tensor(TensorType::ACL_INT_0);
+    auto dst       = tensors.get_tensor(TensorType::ACL_DST);
+
+    // Setup inputs/outputs
+    ARM_COMPUTE_ERROR_ON(workspace == nullptr || workspace->buffer() == nullptr);
+    _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer()));
+
+    ARM_COMPUTE_ERROR_ON(src->buffer() == nullptr);
+    const int input_element_size = src->info()->element_size();
+    const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size;
+    const int input_row_stride   = src->info()->strides_in_bytes().z() / input_element_size;
+    const int input_col_stride   = src->info()->strides_in_bytes().y() / input_element_size;
+    const void *input_ptr        = src->buffer() + src->info()->offset_first_element_in_bytes();
+    _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+
+    ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr);
+    const int output_element_size = dst->info()->element_size();
+    const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size;
+    const int output_row_stride   = dst->info()->strides_in_bytes().z() / output_element_size;
+    const int output_col_stride   = dst->info()->strides_in_bytes().y() / output_element_size;
+    void *output_ptr              = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+    _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+
+    // Schedule assembly kernel
+    NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX);
+}
+
+void CpuDepthwiseConvolutionAssemblyDispatch::prepare(ITensorPack &tensors)
+{
+    if(!_pImpl->is_prepared)
+    {
+        auto weights        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+        auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1);
+
+        ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr);
+
+        // Pack weights and bias
+        const int weights_element_size = weights->info()->element_size();
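+        // TensorInfo strides are expressed in bytes, while the assembly kernel expects element
+        // strides, hence the division by the element size (e.g. a 64-byte row stride over FP32
+        // data is a row stride of 16 elements).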
+ const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; + const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; + _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), + weights->buffer() + weights->info()->offset_first_element_in_bytes(), + weights_row_stride, + weights_col_stride, + (bias != nullptr) ? bias->buffer() : nullptr); + _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); + + weights->mark_as_unused(); + if(bias != nullptr) + { + bias->mark_as_unused(); + } + _pImpl->is_prepared = true; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h new file mode 100644 index 0000000000..6aac74c3ef --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H +#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H + +#include "src/runtime/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Depthwise convolution assembly kernel glue */ +class CpuDepthwiseConvolutionAssemblyDispatch : public ICpuOperator +{ +public: + CpuDepthwiseConvolutionAssemblyDispatch(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConvolutionAssemblyDispatch(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; + /** Default move constructor */ + CpuDepthwiseConvolutionAssemblyDispatch(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConvolutionAssemblyDispatch &operator=(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; + /** Default move assignment operator */ + CpuDepthwiseConvolutionAssemblyDispatch &operator=(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; + /** Default destructor */ + ~CpuDepthwiseConvolutionAssemblyDispatch(); + /** Initialize the function's source, destination, kernels and border_size. + * + * @note Supports only NHWC format + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. 
(Written to only for border filling).
+     * @param[in]  weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+     * @param[in]  bias    (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                     Data type supported: Same as @p input.
+     * @param[out] output  Destination tensor info. Data type supported: same as @p input.
+     * @param[in]  info    Depthwise convolution meta-data.
+     */
+    void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const ConvolutionInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionAssemblyDispatch
+     *
+     * @note Supports only NHWC format
+     *
+     * @param[in] input   Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+     * @param[in] bias    (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                    Data type supported: Same as @p input.
+     * @param[in] output  Destination tensor info. Data type supported: same as @p input.
+     * @param[in] info    Depthwise convolution meta-data.
+     *
+     * @return An error status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const ConvolutionInfo &info);
+    /** Check if the optimized kernel can be used for the given kernel sizes and strides
+     *
+     * @warning Even if this returns true the inputs and outputs might need to get permuted as the only layout supported is NHWC
+     *
+     * @param[in] input   Input tensor info.
+     * @param[in] weights Weights tensor info.
+     * @param[in] info    Depthwise convolution meta-data.
+     *
+     * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
+     */
+    static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    struct LocalImpl;
+    std::unique_ptr<LocalImpl> _pImpl;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */
diff --git a/tests/NEON/Helper.h b/tests/NEON/Helper.h
index 714152ebcd..c9e53d11b0 100644
--- a/tests/NEON/Helper.h
+++ b/tests/NEON/Helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,7 +27,9 @@
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
 #include "tests/Globals.h"
 
 #include <memory>
@@ -104,7 +106,7 @@ public:
 
 /** As above but this also setups a Zero border on the input tensor of the kernel's bordersize */
 template <typename K>
-class NESynthetizeFunctionWithZeroConstantKernelBorder : public INESimpleFunction
+class NESynthetizeFunctionWithZeroConstantKernelBorder : public cpu::ICpuOperator
 {
 public:
     /** Configure the kernel. 
@@ -123,6 +125,15 @@ public:
         b->configure(first, BorderSize(_kernel->border_size()), BorderMode::CONSTANT, PixelValue());
         _border_handler = std::move(b);
     }
+
+    void run(ITensorPack &tensors)
+    {
+        NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
+        NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+    }
+
+private:
+    std::unique_ptr<INEKernel> _border_handler{ nullptr };
 };
 } // namespace test
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
index 3314227bec..5c2ebaa51f 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/framework/Macros.h"
@@ -37,12 +37,12 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-// Create function for NEDepthwiseConvolutionLayerKernel
-using NEDepthwiseConvolutionLayerNative = NESynthetizeFunctionWithZeroConstantKernelBorder<NEDepthwiseConvolutionLayerNativeKernel>;
+// Create function for CpuDepthwiseConvolutionKernel
+using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>;
 
 // Fixture for NEDepthwiseConvolutionLayerKernel
 template <typename T>
-using NEDepthwiseConvolutionLayerNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerNative, T>;
+using CpuDepthwiseConvolutionNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, CpuDepthwiseConvolutionNative, T>;
 
 namespace
 {
@@ -124,8 +124,9 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
     auto biases = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout);
     auto dst    = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout);
 
-    NEDepthwiseConvolutionLayerNativeKernel dwc;
-    dwc.configure(&src, &weights, &biases, &dst, pad_stride_info);
+    cpu::kernels::CpuDepthwiseConvolutionNativeKernel dwc;
+    const ConvolutionInfo info{pad_stride_info, 1, ActivationLayerInfo(), Size2D(1, 1)};
+    dwc.configure(src.info(), weights.info(), biases.info(), dst.info(), info);
 
     ARM_COMPUTE_EXPECT(src.info()->padding().empty(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(weights.info()->padding().empty(), framework::LogLevel::ERRORS);
@@ -135,7 +136,7 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::ALL,
                            combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_precommit,
                                    height_values_precommit),
                                    channel_values_precommit),
@@ -152,7 +153,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<f
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_nightly,
                                    height_values_nightly),
                                    channel_values_nightly),
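For reference, a minimal sketch (tensor names and parameter values are illustrative only) of how
the ported operator is driven under the new API, mirroring the fixture changes below:
configuration works on ITensorInfo, execution on an ITensorPack.

    cpu::CpuDepthwiseConvolution dwc;
    const ConvolutionInfo info{ PadStrideInfo(1, 1, 0, 0), 1, ActivationLayerInfo(), Size2D(1, 1) };
    dwc.configure(src.info(), weights.info(), biases.info(), dst.info(), info);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &src);     // source
    pack.add_const_tensor(TensorType::ACL_SRC_1, &weights); // weights
    pack.add_const_tensor(TensorType::ACL_SRC_2, &biases);  // biases (nullptr tensor if unused)
    pack.add_tensor(TensorType::ACL_DST, &dst);             // destination
    // NOTE: any ACL_INT_* scratch tensors reported by dwc.workspace() are omitted here for brevity.
    dwc.prepare(pack); // one-off weight permutation/packing
    dwc.run(pack);

diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index e87e31f97b..a0ff2c5ac9 100644
--- 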
a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -78,9 +78,10 @@ public:
 
         _weights_shape = TensorShape(kernel_size.width, kernel_size.height);
 
-        const TensorInfo in_info(_input_shape, 1, _input_data_type);
-        const TensorInfo we_info(_weights_shape, 1, _weights_data_type);
-        _output_shape = compute_depthwise_convolution_shape(in_info, we_info, _pad_stride_info, _depth_multiplier, _dilation);
+        const TensorInfo      in_info(_input_shape, 1, _input_data_type);
+        const TensorInfo      we_info(_weights_shape, 1, _weights_data_type);
+        const ConvolutionInfo info{ _pad_stride_info, _depth_multiplier, _act_info, _dilation };
+        _output_shape = compute_depthwise_convolution_shape(in_info, we_info, info);
 
         _weights_shape.set(2, _output_shape.z());
         _biases_shape = TensorShape(_weights_shape[2]);
@@ -301,7 +302,11 @@ public:
         _target = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
 
         // Create Depthwise Convolution configure function
-        _dwc.configure(&_src, &_weights, &_biases, &_target, _conv_info, _depth_multiplier, _dilation);
+        const ConvolutionInfo info
+        {
+            _conv_info, _depth_multiplier, ActivationLayerInfo(), _dilation
+        };
+        _dwc.configure(_src.info(), _weights.info(), _biases.info(), _target.info(), info);
 
         ARM_COMPUTE_EXPECT(_src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -329,8 +334,14 @@ public:
         fill(AccessorType(_weights), 1);
         fill(AccessorType(_biases), 2);
 
+        arm_compute::ITensorPack pack;
+        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_0, &_src);
+        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &_weights);
+        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &_biases);
+        pack.add_tensor(arm_compute::TensorType::ACL_DST, &_target);
+
         // Compute function
-        _dwc.run();
+        _dwc.run(pack);
     }
 
     void compute_reference()
@@ -343,9 +354,9 @@ public:
         fill(weights, 1);
         fill(biases, 2);
 
-        const TensorShape dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), _conv_info,
-                                                                          _depth_multiplier, _dilation);
-        _reference = reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, _dilation);
+        const ConvolutionInfo info{ _conv_info, _depth_multiplier, ActivationLayerInfo(), _dilation };
+        const TensorShape     dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), info);
+        _reference = reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, _dilation);
     }
 
 protected:
@@ -485,9 +496,9 @@ public:
         fill(weights, 1);
         fill(biases, 2);
 
-        const TensorShape dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), _conv_info,
-                                                                          _depth_multiplier, _dilation);
-        _reference = reference::activation_layer(reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, _dilation), _act_info);
+        const ConvolutionInfo info{ _conv_info, _depth_multiplier, _act_info, _dilation };
+        const TensorShape     dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), info);
+        _reference = reference::activation_layer(reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, 
_dilation), _act_info); } protected: -- cgit v1.2.1