From f401c74a963a1ce2e188cd20269650063c1d483c Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Tue, 12 May 2020 16:18:33 +0100 Subject: COMPMID-3483: Refactor NEDepthwiseConvolutionLayerNativeKernel Removed the is_per_channel template argument since it wasn't used anywhere, and made has_biases a runtime parameter. The total size reduction from this change is 28.6 KB. Change-Id: I292ac27ae3ea2885b8438f613390486323982664 Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3189 Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../NEDepthwiseConvolutionLayerNativeKernel.h | 17 +++--- .../NEDepthwiseConvolutionLayerNativeKernel.cpp | 61 ++++++++++------------ 2 files changed, 37 insertions(+), 41 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h index 1303cf9021..9737c9932e 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h @@ -95,22 +95,22 @@ public: BorderSize border_size() const override; private: - template < typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename std::enable_if < std::is_same::value + template < typename T, typename TW, int S, typename std::enable_if < std::is_same::value #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - || std::is_same::value + || std::is_same::value #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - , - int >::type = 0 > - void run_depthwise(const Window &window); + , + int >::type = 0 > + void run_depthwise(const Window &window, bool has_biases); - template < typename T, typename TW, int S, bool has_biases, bool is_per_channel, REQUIRES_TA(std::is_same::value || std::is_same::value) > - void run_depthwise(const Window &window); + template < typename T, typename TW, int S, 
REQUIRES_TA(std::is_same::value || std::is_same::value) > + void run_depthwise(const Window &window, bool has_biases); /** Common signature for all the specialised depthwise convolution native functions * * @param[in] window Region on which to execute the kernel. */ - using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window); + using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases); DepthwiseFunctionPtr _func; BorderSize _border_size; @@ -123,6 +123,7 @@ private: Size2D _dilation; std::vector _output_multiplier; std::vector _output_shift; + bool _has_biases; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp index 4639922da7..ef196ab904 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp @@ -46,9 +46,9 @@ void pad_vectors(std::vector &mult, std::vector &shift, int vec_size) } } -template +template void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window) + const Size2D &dilation, const Window &window, bool has_biases) { using VectorType = typename wrapper::traits::neon_vector::type; using TagType = typename wrapper::traits::neon_vector::tag_type; @@ -119,9 +119,9 @@ void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, input_it, weights_it, biases_it, output_it); } -template +template void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window) 
+ const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) { const size_t input_stride_y = input->info()->strides_in_bytes().y(); const size_t input_stride_z = input->info()->strides_in_bytes().z(); @@ -203,9 +203,9 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con input_it, weights_it, biases_it, output_it); } -template +template void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window) + const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) { using VectorType = typename wrapper::traits::neon_vector::type; using TagType = typename wrapper::traits::neon_vector::tag_type; @@ -308,9 +308,9 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w input_it, weights_it, biases_it, output_it); } -template +template void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window) + const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) { const size_t input_stride_y = input->info()->strides_in_bytes().y(); const size_t input_stride_z = input->info()->strides_in_bytes().z(); @@ -497,7 +497,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel() - : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), 
_output_multiplier(), _output_shift() + : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases() { } @@ -520,6 +520,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co _depth_multiplier = depth_multiplier; _border_size = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); _dilation = dilation; + _has_biases = (biases != nullptr); if(is_data_type_quantized(_input->info()->data_type())) { @@ -550,38 +551,32 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co switch(_weights->info()->data_type()) { case DataType::QASYMM8: - _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise : - &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; pad_vectors(_output_multiplier, _output_shift, 8); break; case DataType::QASYMM8_SIGNED: - _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise : - &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; pad_vectors(_output_multiplier, _output_shift, 8); break; case DataType::QSYMM8_PER_CHANNEL: if(_input->info()->data_type() == DataType::QASYMM8) { - _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise : - &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; } else { - _func = (biases != nullptr) ? 
&NEDepthwiseConvolutionLayerNativeKernel::run_depthwise : - &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; } pad_vectors(_output_multiplier, _output_shift, 8); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise : - &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; pad_vectors(_output_multiplier, _output_shift, 4); break; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise : - &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; pad_vectors(_output_multiplier, _output_shift, 2); break; default: @@ -611,43 +606,43 @@ void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const Th ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - (this->*_func)(window); + (this->*_func)(window, _has_biases); } -template < typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename std::enable_if < std::is_same::value +template < typename T, typename TW, int S, typename std::enable_if < std::is_same::value #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - || std::is_same::value + || std::is_same::value #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - , - int >::type > -void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window) + , + int >::type > +void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); if(_depth_multiplier == 1) { - depthwise_loop_multiplier1_fp(_input, _weights, 
_biases, _output, _conv_info, _dilation, window); + depthwise_loop_multiplier1_fp(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases); } else { - depthwise_loop_generic_fp(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window); + depthwise_loop_generic_fp(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases); } } -template -void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window) +template +void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); if(_depth_multiplier == 1) { - depthwise_loop_multiplier1_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window); + depthwise_loop_multiplier1_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); } else { - depthwise_loop_generic_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window); + depthwise_loop_generic_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); } } } // namespace arm_compute -- cgit v1.2.1