From 7d0adc602b3a3ff66184632fd388b25384a9bc99 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 4 Sep 2020 15:25:24 +0100
Subject: COMPMID-3151: Remove NEDepthwiseConvolutionLayer3x3Kernel

Prefer NEDepthwiseConvolutionLayerNativeKernel as it has a native format
of NHWC avoiding extra transformation to the NCHW domain.

Signed-off-by: Georgios Pinitas
Change-Id: If5d8de11691b8ef7f4c3816941f87417d0c8646b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3930
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 230 +++------------------
 1 file changed, 32 insertions(+), 198 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index cfdf2038b9..915a2830bf 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -58,24 +58,7 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
     }
 
-    const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type());
-
-    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
-    {
-        TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier, dilation));
-
-        if(is_quantized)
-        {
-            DirectConvolutionLayerOutputStageKernelInfo direct_conv_info;
-            direct_conv_info.output_data_type = input->data_type();
-            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, direct_conv_info));
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
 
     //Validate Activation Layer
     if(act_info.enabled())
@@ -87,117 +70,34 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo
 } // namespace
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
-      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
-      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+    : _memory_group(memory_manager), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(),
+      _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_nchw(true), _permute(false),
+      _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_generic(ITensor *input,
-                                                                                                  const ITensor *weights,
-                                                                                                  const ITensor *biases,
-                                                                                                  ITensor *output,
-                                                                                                  const PadStrideInfo &conv_info,
-                                                                                                  unsigned int depth_multiplier,
-                                                                                                  const ActivationLayerInfo &act_info,
-                                                                                                  const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
+                                                                                          const ITensor *weights,
+                                                                                          const ITensor *biases,
+                                                                                          ITensor *output, const PadStrideInfo &conv_info,
+                                                                                          unsigned int depth_multiplier,
+                                                                                          const ActivationLayerInfo &act_info,
+                                                                                          const Size2D &dilation)
 {
-    ARM_COMPUTE_UNUSED(act_info);
-
-    PixelValue zero_value(0.f);
-
-    // Initialize the intermediate accumulator tensor in case of quantized input
-    if(_is_quantized)
-    {
-        TensorShape accum_shape = output->info()->tensor_shape();
-        DataLayout accum_layout = output->info()->data_layout();
-        if(!_is_nchw)
-        {
-            permute(accum_shape, PermutationVector(1U, 2U, 0U));
-            accum_layout = DataLayout::NCHW;
-        }
-
-        _memory_group.manage(&_accumulator);
-        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
-        _accumulator.info()->set_data_layout(accum_layout);
-        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
-    }
-
-    if(!_is_nchw)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NHWC -> NCHW
-        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-        // Configure the function to transform the weights tensor from HWI -> IHW
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
-        // Configure depthwise
-        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
-
-        // Configure border handler
-        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-    }
-    else
-    {
-        // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
-
-        // Configure border handler
-        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-    }
-
-    // Configure biases accumulation
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
-
-        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-        int32_t output_multiplier;
-        int32_t output_shift;
-        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-
-        DirectConvolutionLayerOutputStageKernelInfo direct_conv_info;
-        direct_conv_info.result_fixedpoint_multiplier = output_multiplier;
-        direct_conv_info.result_shift = output_shift;
-        direct_conv_info.result_offset_after_shift = oq_info.offset;
-        direct_conv_info.output_data_type = input->info()->data_type();
-        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, direct_conv_info);
-        _accumulator.allocator()->allocate();
-    }
-    else if(_has_bias)
-    {
-        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
-    }
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
+                                                                                      output->info(), conv_info, depth_multiplier, act_info, dilation));
 
-    // Permute output
-    if(!_is_nchw)
-    {
-        // Configure the function to transform the convoluted output to NHWC
-        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-        _permuted_output.allocator()->allocate();
-    }
-}
+    _original_weights = weights;
+    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _has_bias = biases != nullptr;
+    _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+    _permute = _is_nchw;
+    _is_prepared = false;
+    _is_activationlayer_enabled = act_info.enabled();
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_optimized(const ITensor *input,
-                                                                                                    const ITensor *weights,
-                                                                                                    const ITensor *biases,
-                                                                                                    ITensor *output,
-                                                                                                    const PadStrideInfo &conv_info,
-                                                                                                    unsigned int depth_multiplier,
-                                                                                                    const ActivationLayerInfo &act_info,
-                                                                                                    const Size2D &dilation)
-{
+    // Configure pipeline
     ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
     const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
     const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
@@ -238,43 +138,6 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
     {
         _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
     }
-}
-
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
-                                                                                          const ITensor *weights,
-                                                                                          const ITensor *biases,
-                                                                                          ITensor *output, const PadStrideInfo &conv_info,
-                                                                                          unsigned int depth_multiplier,
-                                                                                          const ActivationLayerInfo &act_info,
-                                                                                          const Size2D &dilation)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
-                                                                                      output->info(), conv_info, depth_multiplier, act_info, dilation));
-
-    _original_weights = weights;
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias = biases != nullptr;
-    _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
-                                                                                   weights->info(),
-                                                                                   conv_info,
-                                                                                   depth_multiplier,
-                                                                                   dilation);
-    _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
-    _permute = _is_optimized == _is_nchw;
-    _is_prepared = false;
-    _is_activationlayer_enabled = act_info.enabled();
-
-    // Configure appropriate pipeline
-    if(_is_optimized)
-    {
-        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    }
-    else
-    {
-        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    }
 
     // Configure activation
     if(_is_activationlayer_enabled)
@@ -295,29 +158,18 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
     return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_generic()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
 {
-    // Fill border
-    NEScheduler::get().schedule(&_border_handler, Window::DimX);
-
-    // Execute depthwise convolution
-    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
+    prepare();
 
-    // Add biases
-    if(_has_bias || _is_quantized)
-    {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
-    }
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
-    // Permute output
-    if(!_is_nchw)
+    // Permute input
+    if(_permute)
     {
-        _permute_output.run();
+        _permute_input.run();
     }
-}
-
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_optimized()
-{
+
     // Run assembly function
     _dwc_optimized_func.run();
 
@@ -326,21 +178,6 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
     {
         _permute_output.run();
     }
-}
-
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Permute input
-    if(_permute)
-    {
-        _permute_input.run();
-    }
-
-    _is_optimized ? run_optimized() : run_generic();
 
     // Run activation
     if(_is_activationlayer_enabled)
@@ -362,13 +199,10 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
         }
 
         // Prepare optimized function
-        if(_is_optimized)
+        _dwc_optimized_func.prepare();
+        if(!_permuted_weights.is_used())
         {
-            _dwc_optimized_func.prepare();
-            if(!_permuted_weights.is_used())
-            {
-                _permuted_weights.allocator()->free();
-            }
+            _permuted_weights.allocator()->free();
        }
 
         _is_prepared = true;
--
cgit v1.2.1
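
Usage note (appended for illustration; not part of the patch above): after this change the optimized internal path always dispatches to NEDepthwiseConvolutionAssemblyDispatch, and the sketch below assumes that the public NEDepthwiseConvolutionLayer::configure() takes the same parameter list as the internal configure() shown in the hunks. The 32x32x16 NHWC shapes, 3x3 kernel, unit stride/padding, FP32 data type and RELU activation are illustrative choices, not values taken from the commit.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NHWC shapes in Compute Library are expressed as [C, W, H, (N)]; all sizes here are assumptions.
    TensorInfo in_info(TensorShape(16U, 32U, 32U), 1, DataType::F32);
    in_info.set_data_layout(DataLayout::NHWC);
    TensorInfo w_info(TensorShape(16U, 3U, 3U), 1, DataType::F32);
    w_info.set_data_layout(DataLayout::NHWC);
    TensorInfo b_info(TensorShape(16U), 1, DataType::F32);
    TensorInfo out_info(TensorShape(16U, 32U, 32U), 1, DataType::F32);
    out_info.set_data_layout(DataLayout::NHWC);

    Tensor input, weights, biases, output;
    input.allocator()->init(in_info);
    weights.allocator()->init(w_info);
    biases.allocator()->init(b_info);
    output.allocator()->init(out_info);

    // Same parameter order as the configure() shown in the diff:
    // input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation.
    NEDepthwiseConvolutionLayer dwc;
    dwc.configure(&input, &weights, &biases, &output,
                  PadStrideInfo(1, 1, 1, 1), // stride 1, pad 1
                  1,                         // depth_multiplier
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                  Size2D(1U, 1U));           // dilation

    input.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input, weights and biases with real data before running ...

    dwc.run();
    return 0;
}

Keeping the whole pipeline in NHWC is what lets the assembly-backed path above skip the NCHW permutes that the removed 3x3 kernel path required.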