From dcf4c87cf78a5f1667699c1a3511d09356938660 Mon Sep 17 00:00:00 2001
From: Giorgio Arena <giorgio.arena@arm.com>
Date: Fri, 16 Apr 2021 12:41:45 +0100
Subject: CLDepthwiseConvolutionLayer rework - Part 1

Remove the reshaped variant for CLDepthwiseConvolutionLayer 3x3 NHWC Quantized

- Remove kernel selection by GPUTarget
- Remove unused quantized support from the NHWC kernel
- Remove CLDepthwiseConvolutionLayerReshapeWeightsKernel
- Remove OpenCL kernels for reshaped dwc 3x3 quantized and weights reshape
- Remove the "_bifrost" suffix from the common OpenCL kernel
- Remove the ICLDepthwiseConvolutionLayer3x3Kernel common interface

Resolve COMPMID-3864, COMPMID-3907

Change-Id: Icfac0fb6c00e214985beb05dad7c0cdbbee7d830
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5447
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/Types.h                           |  6 ---
 arm_compute/core/utils/misc/ShapeCalculator.h      | 24 -----------
 .../CL/functions/CLDepthwiseConvolutionLayer.h     | 49 ++++++++++------------
 3 files changed, 23 insertions(+), 56 deletions(-)

(limited to 'arm_compute')

diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index b1f340d18e..b5fd21d29d 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1874,12 +1874,6 @@ struct ConvolutionInfo
     Size2D              dilation{ Size2D(1, 1) }; /**< Dilation, in elements, across x and y. Defaults to (1, 1). */
 };
 
-struct DepthwiseConvolutionReshapeInfo
-{
-    unsigned int c0{ 1 };            /**< Number of channels processed by the depth-wise convolution */
-    bool         transpose{ false }; /**< True if the block MxC0 (where M is the area of the filter i.e. KwxKh) has to be transposed */
-};
-
 /** GEMMLowp output stage type */
 enum class GEMMLowpOutputStageType
 {
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index ba37f9a61e..8e49c068af 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -287,30 +287,6 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte
     return shape_interleaved_a;
 }
 
-/** Calculate the reshaped shape of the weights to use in depthwise convolution
- *
- * @param[in] input Input tensor info
- * @param[in] info  Depthwise convolution information to be used for reshaping.
- *
- * @return the calculated shape
- */
-inline TensorShape compute_reshaped_depthwise_weights_shape(const ITensorInfo &input, const DepthwiseConvolutionReshapeInfo &info)
-{
-    const auto  data_layout = input.data_layout();
-    TensorShape weights_shape{};
-
-    const int    width_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int    height_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int    channel_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const size_t num_channels = input.dimension(channel_idx);
-    const size_t num_rows     = input.dimension(height_idx);
-    const size_t num_cols     = input.dimension(width_idx);
-
-    weights_shape.set(0, num_rows * num_cols * info.c0);
-    weights_shape.set(1, DIV_CEIL(num_channels, info.c0));
-    return weights_shape;
-}
-
 /** Calculate the transposed 1xW shape
  *
  * @param[in] b Input tensor info
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index e2c5d683cf..1af9e1dc6f 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -35,8 +35,8 @@ namespace arm_compute
 class CLCompileContext;
 class CLFillBorderKernel;
 class CLDepthwiseConvolutionLayerNativeKernel;
-class CLDepthwiseConvolutionLayerReshapeWeightsKernel;
-class ICLDepthwiseConvolutionLayer3x3Kernel;
+class CLDepthwiseConvolutionLayer3x3NCHWKernel;
+class CLDepthwiseConvolutionLayer3x3NHWCKernel;
 class ICLTensor;
 
 /** Function to execute a depthwise convolution
@@ -123,19 +123,17 @@ private:
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in] act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
      * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] gpu_target       (Optional) GPU target to validate the kernel for. Defaults to midgard.
      *
      * @return a Depthwise Convolution Function
      */
     static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
                                                                           const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
-                                                                          ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), GPUTarget gpu_target = GPUTarget::MIDGARD);
+                                                                          ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
 
     /** Basic function to execute a depthwise convolution for kernel size 3x3xC (when data layout NCHW) or Cx3x3 (when data layout NHWC). This function calls the following OpenCL kernels:
     *
     * -# @ref CLDepthwiseConvolutionLayer3x3NCHWKernel (if data_layout == NCHW)
     * -# @ref CLDepthwiseConvolutionLayer3x3NHWCKernel (if data_layout == NHWC)
-    * -# @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel (if data_layout == NHWC)
     * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0)
     *
     */
@@ -200,7 +198,7 @@ private:
          * @return a status
          */
         static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
-                               ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD, const Size2D &dilation = Size2D(1U, 1U));
+                               ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
 
         // Inherited methods overriden:
         void run() override;
@@ -212,26 +210,25 @@ private:
         };
 
     private:
-        MemoryGroup                                                      _memory_group;
-        std::unique_ptr<ICLDepthwiseConvolutionLayer3x3Kernel>           _kernel;
-        std::unique_ptr<CLFillBorderKernel>                              _border_handler;
-        CLPermute                                                        _permute_input_to_nchw;
-        CLPermute                                                        _permute_weights_to_nchw;
-        CLPermute                                                        _permute_output_to_nhwc;
-        std::unique_ptr<CLDepthwiseConvolutionLayerReshapeWeightsKernel> _reshape_weights;
-        CLTensor                                                         _permuted_input;
-        CLTensor                                                         _permuted_weights;
-        CLTensor                                                         _permuted_output;
-        CLTensor                                                         _output_multipliers;
-        CLTensor                                                         _output_shifts;
-        const ITensor                                                   *_original_weights;
-        const ITensor                                                   *_input;
-        const ITensor                                                   *_output;
-        bool                                                             _needs_permute;
-        bool                                                             _needs_weights_reshape;
-        bool                                                             _is_prepared;
-        bool                                                             _is_quantized;
-        bool                                                             _is_nhwc;
+        MemoryGroup                                               _memory_group;
+        std::unique_ptr<CLDepthwiseConvolutionLayer3x3NCHWKernel> _kernel_nchw;
+        std::unique_ptr<CLDepthwiseConvolutionLayer3x3NHWCKernel> _kernel_nhwc;
+        std::unique_ptr<CLFillBorderKernel>                       _border_handler;
+        CLPermute                                                 _permute_input_to_nchw;
+        CLPermute                                                 _permute_weights_to_nchw;
+        CLPermute                                                 _permute_output_to_nhwc;
+        CLTensor                                                  _permuted_input;
+        CLTensor                                                  _permuted_weights;
+        CLTensor                                                  _permuted_output;
+        CLTensor                                                  _output_multipliers;
+        CLTensor                                                  _output_shifts;
+        const ITensor                                            *_original_weights;
+        const ITensor                                            *_input;
+        const ITensor                                            *_output;
+        bool                                                      _needs_permute;
+        bool                                                      _is_prepared;
+        bool                                                      _is_quantized;
+        bool                                                      _is_nhwc;
     };
 
     /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels:
-- 
cgit v1.2.1