From 6d109965f3641056bb8164dc8450a7327e76e939 Mon Sep 17 00:00:00 2001 From: giuros01 Date: Mon, 7 Jan 2019 17:47:19 +0000 Subject: COMPMID-1691: Optimize CLDepthwiseConvolutionKernel (QASYMM8/NHWC) for 3x3 kernels (stride=1 and stride=2) Change-Id: I7d0d2dc350feeb40d253d17f9ffd5051a8fb42ef Reviewed-on: https://review.mlplatform.org/511 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins --- .../CL/functions/CLDepthwiseConvolutionLayer.h | 44 ++++++++++++---------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h') diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index 60dddbb853..23034c2b7c 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,9 +26,10 @@ #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h" +#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h" #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h" #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h" @@ -48,6 +49,7 @@ class ICLTensor; * * -# @ref CLDepthwiseConvolutionLayer3x3NCHWKernel (if data_layout == NCHW) * -# @ref CLDepthwiseConvolutionLayer3x3NHWCKernel (if data_layout == NHWC) + * -# @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel (if data_layout == NHWC) * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0) * */ @@ -105,11 +107,13 @@ private: CLPermute _permute_input_to_nchw; CLPermute _permute_weights_to_nchw; CLPermute _permute_output_to_nhwc; + CLDepthwiseConvolutionLayerReshapeWeightsKernel _reshape_weights; CLTensor _permuted_input; CLTensor _permuted_weights; CLTensor _permuted_output; const ITensor *_original_weights; bool _needs_permute; + bool _needs_weights_reshape; bool _is_prepared; }; @@ -117,7 +121,7 @@ private: * * -# @ref CLDepthwiseIm2ColKernel * -# @ref CLGEMMMatrixVectorMultiplyKernel - * -# @ref CLDepthwiseWeightsReshapeKernel + * -# @ref CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0) * */ @@ -169,23 +173,23 @@ public: void prepare() override; private: - CLDepthwiseIm2ColKernel _im2col_kernel; - CLDepthwiseWeightsReshapeKernel 
_weights_reshape_kernel; - CLGEMMMatrixVectorMultiplyKernel _v2mm_kernel; - CLDepthwiseVectorToTensorKernel _vector_to_tensor_kernel; - CLDirectConvolutionLayerOutputStageKernel _output_stage_kernel; - CLActivationLayer _activationlayer_function; - CLFillBorderKernel _v2mm_input_fill_border; - CLFillBorderKernel _v2mm_weights_fill_border; - CLTensor _input_reshaped; - CLTensor _weights_reshaped; - CLTensor _v2mm_output; - CLTensor _output_reshaped; - bool _is_prepared; - bool _is_quantized; - bool _is_activationlayer_enabled; - const ICLTensor *_original_weights; - std::unique_ptr<IFunction> _optimised_function; + CLDepthwiseIm2ColKernel _im2col_kernel; + CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel _weights_reshape_kernel; + CLGEMMMatrixVectorMultiplyKernel _v2mm_kernel; + CLDepthwiseVectorToTensorKernel _vector_to_tensor_kernel; + CLDirectConvolutionLayerOutputStageKernel _output_stage_kernel; + CLActivationLayer _activationlayer_function; + CLFillBorderKernel _v2mm_input_fill_border; + CLFillBorderKernel _v2mm_weights_fill_border; + CLTensor _input_reshaped; + CLTensor _weights_reshaped; + CLTensor _v2mm_output; + CLTensor _output_reshaped; + bool _is_prepared; + bool _is_quantized; + bool _is_activationlayer_enabled; + const ICLTensor *_original_weights; + std::unique_ptr<IFunction> _optimised_function; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__ */ -- cgit v1.2.1