1 files changed, 241 insertions, 425 deletions
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 7214971044..6c085645db 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,542 +28,358 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuDepthwiseConv2d.h"
+
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
 namespace arm_compute
 {
-namespace
-{
-Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    if(!is_data_type_quantized_per_channel(weights->data_type()))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
-    if(biases != nullptr)
-    {
-        const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
-    }
-
-    const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type());
-
-    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
-    {
-        TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier, dilation));
-
-        if(is_quantized)
-        {
-            DirectConvolutionLayerOutputStageKernelInfo direct_conv_info;
-            direct_conv_info.output_data_type = input->data_type();
-            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, direct_conv_info));
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
-    }
+NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; // NOTE(review): presumably defaulted out-of-line so the pimpl Impl types are complete at this point — confirm against the header
 
-    //Validate Activation Layer
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
-    }
-    return Status{};
-}
-} // namespace
-
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
-      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
-      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl // State shared by configure(), prepare() and run()
+{
+    ITensor                                 *src{nullptr};       // SRC_0: input, packed as ACL_SRC_0 in run()
+    ITensor                                 *dst{nullptr};       // DST_0: output, packed as ACL_DST_0 in run()
+    const ITensor                           *weights{nullptr};   // SRC_1: weights, packed as ACL_SRC_1 in run()
+    const ITensor                           *biases{nullptr};    // SRC_2: optional biases (may stay nullptr)
+    Tensor                                   permuted_input{};   // INT_0: NHWC-permuted input on the permute path
+    Tensor                                   permuted_weights{}; // INT_1: NHWC-permuted weights on the permute path
+    Tensor                                   permuted_output{};  // INT_2: NHWC result before it is permuted back
+    Tensor                                   workspace{};        // INT_3: sized in configure() from workspace() entry [0]
+    Tensor                                   packed_weights{};   // INT_4: sized in configure() from workspace() entry [1]
+    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};        // Created and configured in configure(), executed in run()
+    bool                                     is_prepared{false}; // Set by prepare() so its body runs only once
+    bool                                     permute{false};     // Set in configure(); gates permuted_weights allocation in prepare()
+};
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _impl(std::make_unique<Impl>()) // Impl starts with its in-class defaults; configure() fills it in
 {
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_generic(ITensor                   *input,
-                                                                                                  const ITensor             *weights,
-                                                                                                  const ITensor             *biases,
-                                                                                                  ITensor                   *output,
-                                                                                                  const PadStrideInfo       &conv_info,
-                                                                                                  unsigned int               depth_multiplier,
-                                                                                                  const ActivationLayerInfo &act_info,
-                                                                                                  const Size2D              &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(
+    ITensor                   *input,
+    const ITensor             *weights,
+    const ITensor             *biases,
+    ITensor                   *output,
+    const PadStrideInfo       &conv_info,
+    unsigned int               depth_multiplier,
+    const ActivationLayerInfo &act_info,
+    const Size2D              &dilation)
 {
-    ARM_COMPUTE_UNUSED(act_info);
-
-    PixelValue zero_value(0.f);
-
-    // Initialize the intermediate accumulator tensor in case of quantized input
-    if(_is_quantized)
-    {
-        TensorShape accum_shape  = output->info()->tensor_shape();
-        DataLayout  accum_layout = output->info()->data_layout();
-        if(!_is_nchw)
-        {
-            permute(accum_shape, PermutationVector(1U, 2U, 0U));
-            accum_layout = DataLayout::NCHW;
-        }
-
-        _memory_group.manage(&_accumulator);
-        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
-        _accumulator.info()->set_data_layout(accum_layout);
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
-    }
-
-    if(!_is_nchw)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NHWC -> NCHW
-        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-        // Configure the function to transform the weights tensor from HWI -> IHW
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); // biases is optional and may be nullptr
 
-        // Configure depthwise
-        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
+    bool is_nchw   = input->info()->data_layout() == DataLayout::NCHW; // NCHW input takes the permute path below
+    _impl->src     = input;
+    _impl->weights = weights;
+    _impl->biases  = biases;
+    _impl->dst     = output;
+    _impl->permute = is_nchw;
 
-        // Configure border handler
-        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); // the operator that run() dispatches to
+    ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
+    _impl->op->configure(_impl->src->info(), _impl->weights->info(),
+                         _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info);
 
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-    }
-    else
-    {
-        // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
+    // Configure pipeline
+    ActivationLayerInfo act_info_to_use            = ActivationLayerInfo();
+    const bool          is_relu                    = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool          is_relu6                   = arm_compute::utils::info_helpers::is_relu6(act_info);
+    bool                is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); // enabled activation other than relu/relu6
 
-        // Configure border handler
-        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-    }
-
-    // Configure biases accumulation
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
-
-        float   multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-        int32_t output_multiplier;
-        int32_t output_shift;
-        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-
-        DirectConvolutionLayerOutputStageKernelInfo direct_conv_info;
-        direct_conv_info.result_fixedpoint_multiplier = output_multiplier;
-        direct_conv_info.result_shift                 = output_shift;
-        direct_conv_info.result_offset_after_shift    = oq_info.offset;
-        direct_conv_info.output_data_type             = input->info()->data_type();
-        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, direct_conv_info);
-        _accumulator.allocator()->allocate();
-    }
-    else if(_has_bias)
+    if (!is_activationlayer_enabled)
     {
-        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
+        act_info_to_use = act_info; // relu/relu6 (or a disabled activation) is passed on inside ConvolutionInfo
     }
+    info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation};
 
-    // Permute output
-    if(!_is_nchw)
-    {
-        // Configure the function to transform the convoluted output to NHWC
-        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-        _permuted_output.allocator()->allocate();
-    }
-}
+    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>(); // local: used only while configuring and for its workspace() query
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_optimized(const ITensor             *input,
-                                                                                                    const ITensor             *weights,
-                                                                                                    const ITensor             *biases,
-                                                                                                    ITensor                   *output,
-                                                                                                    const PadStrideInfo       &conv_info,
-                                                                                                    unsigned int               depth_multiplier,
-                                                                                                    const ActivationLayerInfo &act_info,
-                                                                                                    const Size2D              &dilation)
-{
-    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
-    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
-    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
-    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
-    if(!_is_activationlayer_enabled)
+    if (is_nchw)
     {
-        act_info_to_use = act_info;
-    }
+        auto permute_input   = std::make_unique<cpu::CpuPermute>();
+        auto permute_weights = std::make_unique<cpu::CpuPermute>();
+        auto permute_output  = std::make_unique<cpu::CpuPermute>();
 
-    if(_is_nchw)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
+        _memory_group.manage(&_impl->permuted_input);
+        _memory_group.manage(&_impl->permuted_weights);
+        _memory_group.manage(&_impl->permuted_output);
 
         // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
 
         // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
 
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
 
         // Configure optimized depthwise
-        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
+        dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(),
+                                      biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(),
+                                      info);
 
         // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
 
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
+        _impl->permuted_input.allocator()->allocate(); // permuted_weights is allocated later, in prepare()
+        _impl->permuted_output.allocator()->allocate();
     }
     else
     {
-        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
-    }
-}
-
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor       *input,
-                                                                                          const ITensor *weights,
-                                                                                          const ITensor *biases,
-                                                                                          ITensor *output, const PadStrideInfo &conv_info,
-                                                                                          unsigned int               depth_multiplier,
-                                                                                          const ActivationLayerInfo &act_info,
-                                                                                          const Size2D              &dilation)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
-                                                                                      output->info(), conv_info, depth_multiplier, act_info, dilation));
-
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias         = biases != nullptr;
-    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
-                                                                                       weights->info(),
-                                                                                       conv_info,
-                                                                                       depth_multiplier,
-                                                                                       dilation);
-    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
-    _permute                    = _is_optimized == _is_nchw;
-    _is_prepared                = false;
-    _is_activationlayer_enabled = act_info.enabled();
-
-    // Configure appropriate pipeline
-    if(_is_optimized)
-    {
-        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    }
-    else
-    {
-        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    }
-
-    // Configure activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-}
-
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo         *input,
-                                                                                           const ITensorInfo         *weights,
-                                                                                           const ITensorInfo         *biases,
-                                                                                           const ITensorInfo         *output,
-                                                                                           const PadStrideInfo       &conv_info,
-                                                                                           unsigned int               depth_multiplier,
-                                                                                           const ActivationLayerInfo &act_info,
-                                                                                           const Size2D              &dilation)
-{
-    return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_generic()
-{
-    // Fill border
-    NEScheduler::get().schedule(&_border_handler, Window::DimX);
-
-    // Execute depthwise convolution
-    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
-
-    // Add biases
-    if(_has_bias || _is_quantized)
-    {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
+        dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(),
+                                      biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
     }
 
-    // Permute output
-    if(!_is_nchw)
-    {
-        _permute_output.run();
-    }
+    // Allocate memory based on the internal memory requirements
+    experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); // entry [0] sizes workspace, entry [1] sizes packed_weights
+    _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8),
+                                       mem_req[0].alignment);
+    _impl->packed_weights.allocator()->init(
+        TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment);
+    _memory_group.manage(&_impl->workspace);
+    _memory_group.manage(&_impl->packed_weights);
+    _impl->workspace.allocator()->allocate();
+    _impl->packed_weights.allocator()->allocate();
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_optimized()
+Status
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo   *input,
+                                                                                    const ITensorInfo   *weights,
+                                                                                    const ITensorInfo   *biases,
+                                                                                    const ITensorInfo   *output,
+                                                                                    const PadStrideInfo &conv_info,
+                                                                                    unsigned int depth_multiplier,
+                                                                                    const ActivationLayerInfo &act_info,
+                                                                                    const Size2D              &dilation)
 {
-    // Run assembly function
-    _dwc_optimized_func.run();
-
-    // Permute output
-    if(_is_nchw)
-    {
-        _permute_output.run();
-    }
+    ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; // bundle the arguments into the struct the operator's validate() takes
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); // validation is delegated entirely to the operator
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
 {
     prepare();
-
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    // Permute input
-    if(_permute)
-    {
-        _permute_input.run();
-    }
-
-    _is_optimized ? run_optimized() : run_generic();
-
-    // Run activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    ITensorPack pack; // rebuilt on every call from the tensors stored by configure()
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); // nullptr when configure() was given no biases
+    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
+    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
+    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
+    pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace);
+    pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights);
+    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+
+    _impl->op->run(pack); // inputs, intermediates and output all reach the operator through the pack
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
 {
-    if(!_is_prepared)
+    if (!_impl->is_prepared) // one-off work; later calls fall straight through
     {
         // Permute weights
-        if(_permute)
+        if (_impl->permute) // set in configure() from the input layout check
         {
-            _permuted_weights.allocator()->allocate();
-            _permute_weights.run();
-            _original_weights->mark_as_unused();
+            _impl->permuted_weights.allocator()->allocate(); // NOTE(review): the removed permute run / mark_as_unused / prepare() calls have no replacement here — presumably handled inside _impl->op; confirm
         }
 
-        // Prepare optimized function
-        if(_is_optimized)
+        if (!_impl->permuted_weights.is_used()) // release the buffer again when it is not flagged as used
         {
-            _dwc_optimized_func.prepare();
-            if(!_permuted_weights.is_used())
-            {
-                _permuted_weights.allocator()->free();
-            }
+            _impl->permuted_weights.allocator()->free();
         }
 
-        _is_prepared = true;
+        _impl->is_prepared = true;
     }
 }
 
+struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl // State kept by the generic path between configure() and later calls
+{
+    Tensor                                   permuted_input{};   // NHWC-permuted input, set up when the input is NCHW
+    Tensor                                   permuted_weights{}; // NHWC-permuted weights, set up when the input is NCHW
+    Tensor                                   permuted_output{};  // NHWC result that is permuted back into the caller's output
+    bool                                     is_prepared{false}; // Initialised to !is_nchw in configure()
+    bool                                     is_nchw{false};     // True when the configured input uses DataLayout::NCHW
+    bool                                     is_activationlayer_enabled{false}; // NOTE(review): not written in the visible part of this change — confirm it is still used
+    const ITensor                           *weights{nullptr};   // Weights passed to configure()
+    const ITensor                           *biases{nullptr};    // Optional biases passed to configure() (may be nullptr)
+    const ITensor                           *src{nullptr};       // Input passed to configure()
+    ITensor                                 *dst{nullptr};       // Output passed to configure()
+    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};        // Operator created and configured in configure()
+};
+
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
-    : _depthwise_conv_kernel(), _fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(),
-      _is_prepared(false), _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+    : _impl(std::make_unique<Impl>()) // Impl starts with its in-class defaults; configure() fills it in
 {
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor             *input,
+                                                                                const ITensor       *weights,
+                                                                                const ITensor       *biases,
+                                                                                ITensor             *output,
+                                                                                const PadStrideInfo &conv_info,
+                                                                                unsigned int         depth_multiplier,
+                                                                                const ActivationLayerInfo &act_info,
+                                                                                const Size2D              &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
-                                                                     output->info(), conv_info, depth_multiplier, act_info, dilation));
 
-    _is_nchw     = input->info()->data_layout() == DataLayout::NCHW;
-    _is_prepared = !_is_nchw;
+    const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; // bundles the arguments for the configure() calls below
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); // operator kept in Impl beyond this call
+    _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(),
+                         info);
+
+    _impl->src         = input;
+    _impl->dst         = output;
+    _impl->weights     = weights;
+    _impl->biases      = biases;
+    _impl->is_nchw     = input->info()->data_layout() == DataLayout::NCHW;
+    _impl->is_prepared = !_impl->is_nchw; // NHWC input starts out marked as prepared
 
     ITensor       *input_to_use   = input;
     const ITensor *weights_to_use = weights;
     ITensor       *output_to_use  = output;
-    if(_is_nchw)
+    if (_impl->is_nchw) // NCHW input: redirect the kernel to NHWC-permuted tensors
     {
-        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
-        input_to_use = &_permuted_input;
+        auto permute_input   = std::make_unique<cpu::CpuPermute>(); // local: only used during this call
+        auto permute_weights = std::make_unique<cpu::CpuPermute>(); // local: only used during this call
+
+        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        input_to_use = &_impl->permuted_input;
 
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
-        weights_to_use = &_permuted_weights;
+        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
+        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        weights_to_use = &_impl->permuted_weights;
 
-        _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
-        output_to_use = &_permuted_output;
+        _impl->permuted_output.allocator()->init(
+            output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+        output_to_use = &_impl->permuted_output;
     }
-    _original_weights = weights_to_use;
 
-    _depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
-    _fill_border.configure(input_to_use, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()));
+    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); // local: only used during this call
+    depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(),
+                                     biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
 
-    if(_is_nchw)
+    if (_impl->is_nchw)
     {
-        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        auto permute_output = std::make_unique<cpu::CpuPermute>(); // local: only used during this call
+        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
+        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
 
-        _permuted_input.allocator()->allocate();
-        _permuted_weights.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
-    }
-
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
+        _impl->permuted_input.allocator()->allocate(); // all three intermediates are allocated up front on this path
+        _impl->permuted_weights.allocator()->allocate();
+        _impl->permuted_output.allocator()->allocate();
     }
 }
 
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo   *input,
+                                                                                 const ITensorInfo   *weights,
+                                                                                 const ITensorInfo   *biases,
+                                                                                 const ITensorInfo   *output,
                                                                                  const PadStrideInfo &conv_info,
-                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+                                                                                 unsigned int         depth_multiplier,
+                                                                                 const ActivationLayerInfo &act_info,
+                                                                                 const Size2D              &dilation)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    if(input->data_layout() == DataLayout::NCHW)
-    {
-        TensorShape permuted_input_shape   = input->tensor_shape();
-        TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
-        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
-        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
-
-        const TensorInfo permuted_input   = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, dilation));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, dilation));
-    }
-
-    // Validate Activation Layer
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
-    }
-
-    return Status{};
+    const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; // bundled for the cpu operator
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
 {
-    if(_is_nchw)
-    {
-        prepare();
-        _permute_input.run();
-    }
-
-    NEScheduler::get().schedule(&_fill_border, Window::DimX);
-    NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
-
-    if(_is_nchw)
-    {
-        _permute_output.run();
-    }
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-}
-
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::prepare()
-{
-    if(!_is_prepared)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _permute_weights.run();
-        _original_weights->mark_as_unused();
-        _is_prepared = true;
-    }
+    ITensorPack pack; // rebuilt on every call from the tensors referenced by _impl
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); // NOTE(review): presumably nullptr without bias - confirm
+    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input); // ACL_INT_* slots: the three permuted tensors
+    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
+    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
+    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+
+    _impl->op->run(pack); // NOTE(review): replaces the removed explicit sequence; presumably done inside the op
 }
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(std::move(memory_manager)), _func_generic()
+    : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>()) // Impl uses its member defaults
 {
 }
 
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                            const ActivationLayerInfo &act_info, const Size2D &dilation)
+#ifndef DOXYGEN_SKIP_THIS
+struct NEDepthwiseConvolutionLayer::Impl // state of the top-level dispatching function
 {
-    _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation);
-    switch(_depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
-    }
-}
-
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+    DepthwiseConvolutionFunction                 depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED};
+    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr}; // NOTE(review): no memory manager - confirm
+    NEDepthwiseConvolutionLayerGeneric           func_generic{};          // used when depth_conv_func is GENERIC
+    std::shared_ptr<cpu::CpuDepthwiseConv2d>     op{nullptr};             // created in configure() to pick the path
+};
+#endif // DOXYGEN_SKIP_THIS
+
+void NEDepthwiseConvolutionLayer::configure(ITensor                   *input,
+                                            const ITensor             *weights,
+                                            const ITensor             *biases,
+                                            ITensor                   *output,
+                                            const PadStrideInfo       &conv_info,
+                                            unsigned int               depth_multiplier,
+                                            const ActivationLayerInfo &act_info,
+                                            const Size2D              &dilation)
 {
-    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    switch(depth_conv_func)
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); // biases is optional; null is handled below
+
+    ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(
+        input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info,
+        depth_multiplier, act_info, dilation));
+
+    const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
+    _impl->op              = std::make_shared<cpu::CpuDepthwiseConv2d>(); // queried below to pick the code path
+    _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(
+        input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
+    switch (_impl->depth_conv_func) // run() and prepare() dispatch on this same stored value
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
-            return NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+                                            dilation);
             break;
         case DepthwiseConvolutionFunction::GENERIC:
-            return NEDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+                                          dilation);
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
     }
 }
 
-DepthwiseConvolutionFunction NEDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                            const PadStrideInfo &conv_info,
-                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo         *input,
+                                             const ITensorInfo         *weights,
+                                             const ITensorInfo         *biases,
+                                             const ITensorInfo         *output,
+                                             const PadStrideInfo       &conv_info,
+                                             unsigned int               depth_multiplier,
+                                             const ActivationLayerInfo &act_info,
+                                             const Size2D              &dilation)
 {
-    if(bool(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)))
-    {
-        return DepthwiseConvolutionFunction::OPTIMIZED;
-    }
-    else
-    {
-        return DepthwiseConvolutionFunction::GENERIC;
-    }
+    const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; // bundled for the cpu operator
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::run()
 {
-    switch(_depth_conv_func)
+    switch (_impl->depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_optimized.run();
+            _impl->func_optimized.run();
             break;
         case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.run();
+            _impl->func_generic.run();
             break;
         default:
             ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
@@ -572,13 +388,13 @@ void NEDepthwiseConvolutionLayer::run()
 
 void NEDepthwiseConvolutionLayer::prepare()
 {
-    switch(_depth_conv_func)
+    switch (_impl->depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_optimized.prepare();
+            _impl->func_optimized.prepare();
             break;
         case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.prepare();
+            _impl->func_generic.prepare();
             break;
         default:
             ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");