From a046e164b96a8441b2fa14ef578f7db46a0e97da Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Tue, 8 Oct 2019 09:36:26 +0100 Subject: COMPMID-2600: Implement a new and generic depthwise convolution for CL QASYMM8 NHWC The NCHW case is supported at function level by permuting the inputs/outputs to NHWC. This patch also removes CLDirectConvolutionLayerOutputStageKernel which is deprecated and some kernels which were only used in the generic case of depthwise convolution. Change-Id: I91e0f02d0a2f4a4a352e08c248e648944137fe68 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2056 Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice --- arm_compute/core/CL/CLKernels.h | 4 - .../CLDepthwiseConvolutionLayerNativeKernel.h | 4 +- ...seConvolutionLayerReshapeWeightsGenericKernel.h | 79 ------------------- .../core/CL/kernels/CLDepthwiseIm2ColKernel.h | 90 --------------------- .../CL/kernels/CLDepthwiseVectorToTensorKernel.h | 79 ------------------- .../CLDirectConvolutionLayerOutputStageKernel.h | 91 ---------------------- .../CL/functions/CLDepthwiseConvolutionLayer.h | 47 +++++------ 7 files changed, 21 insertions(+), 373 deletions(-) delete mode 100644 arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h delete mode 100644 arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h delete mode 100644 arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h delete mode 100644 arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h (limited to 'arm_compute') diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 3d9b2c81cd..c3c485db7c 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -56,15 +56,11 @@ #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h" #include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" #include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" #include "arm_compute/core/CL/kernels/CLDilateKernel.h" #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h" #include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLErodeKernel.h" diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index f8c841ab6a..31ec871123 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -49,7 +49,7 @@ public: CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default; /** Initialize the function's source, destination and parameters * - * @param[in] input Source tensor. Data type supported: FP32/FP16. 
Data layout supported: NHWC + * @param[in] input Source tensor. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. @@ -64,7 +64,7 @@ public: const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * - * @param[in] input Source tensor info. Data type supported: FP32/FP16. Data layout supported: NHWC + * @param[in] input Source tensor info. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M]. Data type supported: Same as @p input. * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h deleted file mode 100644 index 3f969957e1..0000000000 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSGENERICKERNEL_H__ -#define __ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSGENERICKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the depthwise weights reshape kernel. - * This kernel reshape original weights' low 2D dimensions into a single row and - * have the second dimension as the original depth size. 
- * - **/ -class CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel(const CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel &operator=(const CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel &operator=(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel &&) = default; - /** Set the input and output of the kernel. - * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. - * Data type supported: QASYMM8/F16/F32. - * @param[out] output The output tensor. Data type supported: same as @p input. - * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. - */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel - * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. - * Data type supported: QASYMM8/F32. - * @param[in] output The output tensor. Data type supported: same as @p input. - * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases = nullptr); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_biases; - ICLTensor *_output; -}; -} // arm_compute -#endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSGENERICKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h deleted file mode 100644 index 15798471a8..0000000000 --- a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLDEPTHWISEIM2COLKERNEL_H__ -#define __ARM_COMPUTE_CLDEPTHWISEIM2COLKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Size2D.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the depthwise im2col reshape kernel. - * This kernel reshape the input low 3 dimensions to a new 3D shape where the output's first dimension is - * the linear patch size (FILTER_WIDTH * FILTER_HEIGHT) and second dimension is number of patches per image and third dimension unchanged . - **/ -class CLDepthwiseIm2ColKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDepthwiseIm2ColKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseIm2ColKernel(const CLDepthwiseIm2ColKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseIm2ColKernel &operator=(const CLDepthwiseIm2ColKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDepthwiseIm2ColKernel(CLDepthwiseIm2ColKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDepthwiseIm2ColKernel &operator=(CLDepthwiseIm2ColKernel &&) = default; - /** Set the input and output of the kernel. - * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 - * @param[out] output The output tensor. First 3 lower dimensions represent a transform of each 3D input, - * while every dimension above 3 represents a batch. Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - */ - void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1, - const Size2D &dilation = Size2D(1U, 1U)); - /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseIm2ColKernel - * - * @param[in] input The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F32 - * @param[in] output The output tensor info. First 3 lower dimensions represent a transform of each 3D input, - * while every dimension above 3 represents a batch. 
Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias. - * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier, - const Size2D &dilation = Size2D(1U, 1U)); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; -}; -} // arm_compute -#endif /*__ARM_COMPUTE_CLDEPTHWISEIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h deleted file mode 100644 index c9ec8e13bf..0000000000 --- a/arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLDEPTHWISEVECTORTOTENSORKERNEL_H__ -#define __ARM_COMPUTE_CLDEPTHWISEVECTORTOTENSORKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the depthwise vector to tensor kernel. - * - * This kernel takes the 1D tensor that's been produced by the MatrixVectorMultiply - * kernel and reshapes it to given width and height (previously calculated, based - * on input/weights dimensions and convolution strides and padding). 
- * - **/ -class CLDepthwiseVectorToTensorKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDepthwiseVectorToTensorKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseVectorToTensorKernel(const CLDepthwiseVectorToTensorKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseVectorToTensorKernel &operator=(const CLDepthwiseVectorToTensorKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDepthwiseVectorToTensorKernel(CLDepthwiseVectorToTensorKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDepthwiseVectorToTensorKernel &operator=(CLDepthwiseVectorToTensorKernel &&) = default; - /** Set the input and output of the kernel. - * - * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F16/F32. - * @param[out] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input. - * @param[in] conv_w The converted tensor's width. - * @param[in] conv_h The converted tensor's height. - */ - void configure(const ICLTensor *input, ICLTensor *output, size_t conv_w, size_t conv_h); - /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseVectorToTensorKernel - * - * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F16/F32. - * @param[in] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input. - * @param[in] conv_w The converted tensor's width. - * @param[in] conv_h The converted tensor's height. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h); - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; -}; -} // arm_compute -#endif /*__ARM_COMPUTE_CLDEPTHWISEVECTORTOTENSORKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h deleted file mode 100644 index 80bc012d9f..0000000000 --- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H__ -#define __ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ITensor; -/** OpenCL kernel to accumulate the biases, if provided, or downscale in case of quantized input. - * - * @deprecated This kernel is deprecated and will be removed in release 19.05 - * - * @note We assume bias to be shared - * - */ -class CLDirectConvolutionLayerOutputStageKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDirectConvolutionLayerOutputStageKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDirectConvolutionLayerOutputStageKernel(const CLDirectConvolutionLayerOutputStageKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDirectConvolutionLayerOutputStageKernel &operator=(const CLDirectConvolutionLayerOutputStageKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDirectConvolutionLayerOutputStageKernel(CLDirectConvolutionLayerOutputStageKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDirectConvolutionLayerOutputStageKernel &operator=(CLDirectConvolutionLayerOutputStageKernel &&) = default; - /** Default destructor */ - ~CLDirectConvolutionLayerOutputStageKernel() = default; - /** Set the accumulate buffer and the biases of the kernel. - * - * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: S32/F16/F32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input - * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Required parameter if output is of QASYMM8 type. - * Data types supported: QASYMM8/F16/F32 - * @param[in] result_fixedpoint_multiplier (Optional)Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift (Optional)Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] result_offset_after_shift (Optional)Offset to be applied to result before converting it back to QASYMM8 - */ - void configure(ICLTensor *input, const ICLTensor *bias = nullptr, ICLTensor *output = nullptr, - int result_fixedpoint_multiplier = 0, int result_shift = 0, int result_offset_after_shift = 0); - /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerOutputStageKernel - * - * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: F16/F32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input - * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. 
(Defaults to nullptr) - * Data type supported: F16/F32 - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; - int _result_fixedpoint_multiplier; - int _result_shift; - int _result_offset_after_shift; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index d177f4505a..98581a21fe 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -26,17 +26,12 @@ #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h" +#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h" -#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h" #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h" #include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/CL/functions/CLPermute.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -121,17 +116,15 @@ private: /** Basic function to execute a generic depthwise convolution. 
This function calls the following OpenCL kernels: * - * -# @ref CLDepthwiseIm2ColKernel - * -# @ref CLGEMMMatrixVectorMultiplyKernel - * -# @ref CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel - * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0) + * -# @ref CLDepthwiseConvolutionLayerNativeKernel + * -# @ref CLPermute (x 3) if the data layout is NCHW * */ class CLDepthwiseConvolutionLayer : public IFunction { public: /** Default constructor */ - CLDepthwiseConvolutionLayer(); + CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ CLDepthwiseConvolutionLayer(const CLDepthwiseConvolutionLayer &) = delete; /** Default move constructor */ CLDepthwiseConvolutionLayer(CLDepthwiseConvolutionLayer &&) = default; @@ -177,23 +170,21 @@ public: void prepare() override; private: - CLDepthwiseIm2ColKernel _im2col_kernel; - CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel _weights_reshape_kernel; - CLGEMMMatrixVectorMultiplyKernel _v2mm_kernel; - CLDepthwiseVectorToTensorKernel _vector_to_tensor_kernel; - CLDirectConvolutionLayerOutputStageKernel _output_stage_kernel; - CLActivationLayer _activationlayer_function; - CLFillBorderKernel _v2mm_input_fill_border; - CLFillBorderKernel _v2mm_weights_fill_border; - CLTensor _input_reshaped; - CLTensor _weights_reshaped; - CLTensor _v2mm_output; - CLTensor _output_reshaped; - bool _is_prepared; - bool _is_quantized; - bool _is_activationlayer_enabled; - const ICLTensor *_original_weights; - std::unique_ptr<IFunction> _optimised_function; + MemoryGroup _memory_group; + + std::unique_ptr<IFunction> _optimised_function; + CLDepthwiseConvolutionLayerNativeKernel _dwc_native_kernel; + CLPermute _permute_input_to_nhwc; + CLPermute _permute_weights_to_nhwc; + CLPermute _permute_output_to_nchw; + + CLTensor _permuted_input; + CLTensor _permuted_weights; + CLTensor _permuted_output; + const ITensor *_original_weights; + + bool _needs_permute; + bool _is_prepared; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__ */ -- cgit v1.2.1
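
Usage sketch (illustrative only, not part of the patch): the snippet below shows how the reworked generic CLDepthwiseConvolutionLayer might be driven with QASYMM8 NCHW tensors, the case this change adds support for by permuting input/weights/output to NHWC around CLDepthwiseConvolutionLayerNativeKernel. The tensor shapes, quantization parameters, padding and the S32 bias type are assumptions chosen for the example; with NHWC tensors the function is expected to skip the permute stages and run the native kernel directly.

// Hypothetical example, not taken from the patch. Shapes and quantization
// parameters are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    // Create the OpenCL context and queue used by the CL runtime.
    CLScheduler::get().default_init();

    // QASYMM8 tensors: 16x16 spatial, 8 channels, 3x3 depthwise filter.
    CLTensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32)); // S32 biases assumed for quantized input
    output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));

    // Mark the tensors as NCHW so the function takes the permute-to-NHWC path.
    input.info()->set_data_layout(DataLayout::NCHW);
    weights.info()->set_data_layout(DataLayout::NCHW);
    output.info()->set_data_layout(DataLayout::NCHW);

    // Stride 1 with padding 1 keeps the 16x16 spatial size unchanged.
    CLDepthwiseConvolutionLayer dwc;
    dwc.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));

    // Allocate the backing CL buffers (filling them with real data is omitted here).
    input.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    output.allocator()->allocate();

    dwc.run();                 // permutes to NHWC, runs the native kernel, permutes back
    CLScheduler::get().sync(); // wait for the enqueued kernels to finish
    return 0;
}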