From dcf4c87cf78a5f1667699c1a3511d09356938660 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Fri, 16 Apr 2021 12:41:45 +0100 Subject: CLDepthwiseConvolutionLayer rework - Part 1 Remove the reshaped variant for CLDepthwiseConvolutionLayer 3x3 NHWC Quantized - Remove kernel selection by GPUTarget - Remove unused quantized support from the NHWC kernel - Remove CLDepthwiseConvolutionLayerReshapeWeightsKernel - Remove OpenCL kernels for reshaped dwc 3x3 quantized and weights reshape - Remove the "_bifrost" suffix in common OpenCL kernel - Remove the ICLDepthwiseConvolutionLayer3x3Kernel common interface Resolve COMPMID-3864, COMPMID-3907 Change-Id: Icfac0fb6c00e214985beb05dad7c0cdbbee7d830 Signed-off-by: Giorgio Arena Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5447 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- arm_compute/core/Types.h | 6 --- arm_compute/core/utils/misc/ShapeCalculator.h | 24 ----------- .../CL/functions/CLDepthwiseConvolutionLayer.h | 49 ++++++++++------------ 3 files changed, 23 insertions(+), 56 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index b1f340d18e..b5fd21d29d 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1874,12 +1874,6 @@ struct ConvolutionInfo Size2D dilation{ Size2D(1, 1) }; /**< Dilation, in elements, across x and y. Defaults to (1, 1). */ }; -struct DepthwiseConvolutionReshapeInfo -{ - unsigned int c0{ 1 }; /**< Number of channels processed by the depth-wise convolution */ - bool transpose{ false }; /**< True if the block MxC0 (where M is the area of the filter i.e. 
KwxKh) has to be transposed */ -}; - /** GEMMLowp output stage type */ enum class GEMMLowpOutputStageType { diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index ba37f9a61e..8e49c068af 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -287,30 +287,6 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte return shape_interleaved_a; } -/** Calculate the reshaped shape of the weights to use in depthwise convolution - * - * @param[in] input Input tensor info - * @param[in] info Depthwise convolution information to be used for reshaping. - * - * @return the calculated shape - */ -inline TensorShape compute_reshaped_depthwise_weights_shape(const ITensorInfo &input, const DepthwiseConvolutionReshapeInfo &info) -{ - const auto data_layout = input.data_layout(); - TensorShape weights_shape{}; - - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const size_t num_channels = input.dimension(channel_idx); - const size_t num_rows = input.dimension(height_idx); - const size_t num_cols = input.dimension(width_idx); - - weights_shape.set(0, num_rows * num_cols * info.c0); - weights_shape.set(1, DIV_CEIL(num_channels, info.c0)); - return weights_shape; -} - /** Calculate the transposed 1xW shape * * @param[in] b Input tensor info diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index e2c5d683cf..1af9e1dc6f 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -35,8 +35,8 @@ namespace arm_compute class 
CLCompileContext; class CLFillBorderKernel; class CLDepthwiseConvolutionLayerNativeKernel; -class CLDepthwiseConvolutionLayerReshapeWeightsKernel; -class ICLDepthwiseConvolutionLayer3x3Kernel; +class CLDepthwiseConvolutionLayer3x3NCHWKernel; +class CLDepthwiseConvolutionLayer3x3NHWCKernel; class ICLTensor; /** Function to execute a depthwise convolution @@ -123,19 +123,17 @@ private: * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard. * * @return a Depthwise Convolution Function */ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), GPUTarget gpu_target = GPUTarget::MIDGARD); + ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); /** Basic function to execute a depthwise convolution for kernel size 3x3xC (when data layout NCHW) or Cx3x3 (when data layout NHWC). 
This function calls the following OpenCL kernels: * * -# @ref CLDepthwiseConvolutionLayer3x3NCHWKernel (if data_layout == NCHW) * -# @ref CLDepthwiseConvolutionLayer3x3NHWCKernel (if data_layout == NHWC) - * -# @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel (if data_layout == NHWC) * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0) * */ @@ -200,7 +198,7 @@ private: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD, const Size2D &dilation = Size2D(1U, 1U)); + ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; @@ -212,26 +210,25 @@ private: }; private: - MemoryGroup _memory_group; - std::unique_ptr<ICLDepthwiseConvolutionLayer3x3Kernel> _kernel; - std::unique_ptr<CLFillBorderKernel> _border_handler; - CLPermute _permute_input_to_nchw; - CLPermute _permute_weights_to_nchw; - CLPermute _permute_output_to_nhwc; - std::unique_ptr<CLDepthwiseConvolutionLayerReshapeWeightsKernel> _reshape_weights; - CLTensor _permuted_input; - CLTensor _permuted_weights; - CLTensor _permuted_output; - CLTensor _output_multipliers; - CLTensor _output_shifts; - const ITensor *_original_weights; - const ITensor *_input; - const ITensor *_output; - bool _needs_permute; - bool _needs_weights_reshape; - bool _is_prepared; - bool _is_quantized; - bool _is_nhwc; + MemoryGroup _memory_group; + std::unique_ptr<CLDepthwiseConvolutionLayer3x3NCHWKernel> _kernel_nchw; + std::unique_ptr<CLDepthwiseConvolutionLayer3x3NHWCKernel> _kernel_nhwc; + std::unique_ptr<CLFillBorderKernel> _border_handler; + CLPermute _permute_input_to_nchw; + CLPermute _permute_weights_to_nchw; + CLPermute _permute_output_to_nhwc; + CLTensor _permuted_input; + CLTensor _permuted_weights; + CLTensor _permuted_output; + CLTensor _output_multipliers; + CLTensor _output_shifts; + const ITensor *_original_weights; + const ITensor *_input; + const ITensor *_output; + bool 
_needs_permute; + bool _is_prepared; + bool _is_quantized; + bool _is_nhwc; }; /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels: -- cgit v1.2.1