From dcf4c87cf78a5f1667699c1a3511d09356938660 Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Fri, 16 Apr 2021 12:41:45 +0100
Subject: CLDepthwiseConvolutionLayer rework - Part 1

Remove the reshaped variant for CLDepthwiseConvolutionLayer 3x3 NHWC Quantized

- Remove kernel selection by GPUTarget
- Remove unused quantized support from the NHWC kernel
- Remove CLDepthwiseConvolutionLayerReshapeWeightsKernel
- Remove OpenCL kernels for reshaped dwc 3x3 quantized and weights reshape
- Remove the "_bifrost" suffix in common OpenCL kernel
- Remove the ICLDepthwiseConvolutionLayer3x3Kernel common interface

Resolve COMPMID-3864, COMPMID-3907

Change-Id: Icfac0fb6c00e214985beb05dad7c0cdbbee7d830
Signed-off-by: Giorgio Arena
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5447
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   | 126 ++++++++-------
 1 file changed, 48 insertions(+), 78 deletions(-)

(limited to 'src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp')

diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 8e3d010786..6467caffef 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -30,13 +30,9 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
 
 namespace arm_compute
 {
@@ -46,23 +42,18 @@ using namespace arm_compute::misc::shape_calculator;
 namespace
 {
 Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                              unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+                              unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
     // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
-    const bool is_quantized           = is_data_type_quantized_asymmetric(input->data_type());
-    const bool is_nhwc                = input->data_layout() == DataLayout::NHWC;
-    const bool needs_permute          = is_nhwc && (depth_multiplier > 1);
-    const bool needs_weights_reshape  = is_nhwc && (depth_multiplier == 1) && is_quantized;
-    const bool is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-    const bool is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
-    DepthwiseConvolutionReshapeInfo info;
-    info.c0        = 4;
-    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
+    const bool is_quantized  = is_data_type_quantized_asymmetric(input->data_type());
+    const bool is_nhwc       = input->data_layout() == DataLayout::NHWC;
+    const bool needs_permute = is_nhwc && (depth_multiplier > 1);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(is_quantized && is_nhwc && !needs_permute);
 
     TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
     if(is_quantized)
@@ -96,27 +87,17 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh
         const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
-                                                                                        conv_info, depth_multiplier, act_info, gpu_target,
+                                                                                        conv_info, depth_multiplier, act_info,
                                                                                         dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
     }
     else if(is_nhwc)
     {
-        if(needs_weights_reshape)
-        {
-            auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases,
-                                                                                           output, conv_info, depth_multiplier, act_info,
-                                                                                           dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info,
-                                                                                           dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
-        }
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+                                                                                       dilation));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target,
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info,
                                                                                         dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
     }
     return Status{};
@@ -351,12 +332,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _kernel(nullptr),
+      _kernel_nchw(nullptr),
+      _kernel_nhwc(nullptr),
      _border_handler(std::make_unique<CLFillBorderKernel>()),
      _permute_input_to_nchw(),
      _permute_weights_to_nchw(),
      _permute_output_to_nhwc(),
-      _reshape_weights(std::make_unique<CLDepthwiseConvolutionLayerReshapeWeightsKernel>()),
      _permuted_input(),
      _permuted_weights(),
      _permuted_output(),
@@ -366,7 +347,6 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwise
      _input(nullptr),
      _output(nullptr),
      _needs_permute(false),
-      _needs_weights_reshape(false),
      _is_prepared(false),
      _is_quantized(false),
      _is_nhwc(false)
@@ -383,8 +363,6 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config
                                                                           ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                                                                           ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(),
@@ -394,13 +372,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config
                                                                                 conv_info,
                                                                                 depth_multiplier,
                                                                                 act_info,
-                                                                                gpu_target,
                                                                                 dilation));
 
-    _is_nhwc               = input->info()->data_layout() == DataLayout::NHWC;
-    _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _needs_permute         = _is_nhwc && (depth_multiplier > 1);
-    _needs_weights_reshape = _is_nhwc && (depth_multiplier == 1) && _is_quantized;
+    _is_nhwc       = input->info()->data_layout() == DataLayout::NHWC;
+    _is_quantized  = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _needs_permute = _is_nhwc && (depth_multiplier > 1);
 
     _is_prepared      = false;
     _original_weights = weights;
@@ -412,13 +388,6 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config
     ICLTensor *output_to_use = output;
 
     const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
-    const bool is_stride_1              = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool is_dot8_supported        = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
-    const bool is_stride_1_dilation_1   = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-
-    DepthwiseConvolutionReshapeInfo info;
-    info.c0        = 4;
-    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
     if(_needs_permute)
     {
@@ -438,20 +407,15 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config
         weights_to_use = &_permuted_weights;
         output_to_use  = &_permuted_output;
 
-        _kernel = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
+        _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
     else if(_is_nhwc)
     {
-        if(_needs_weights_reshape)
-        {
-            _reshape_weights->configure(compile_context, weights, &_permuted_weights, info);
-            weights_to_use = &_permuted_weights;
-        }
-        _kernel = std::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+        _kernel_nhwc = std::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
    }
    else
    {
-        _kernel = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
+        _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
    }
 
     CLTensor *output_multipliers_to_use = nullptr;
@@ -469,9 +433,16 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config
     }
 
     // Configure kernel
-    _kernel->set_target(gpu_target);
-    _kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
-                       act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
+    if(_is_nhwc && !_needs_permute)
+    {
+        _kernel_nhwc->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
+                                act_info, dilation);
+    }
+    else
+    {
+        _kernel_nchw->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
+                                act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
+    }
 
     if(_is_quantized)
     {
@@ -496,13 +467,16 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config
     {
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
     }
-    _border_handler->configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+    if(!_is_nhwc || _needs_permute)
+    {
+        _border_handler->configure(compile_context, input_to_use, _kernel_nchw->border_size(), BorderMode::CONSTANT, zero_value);
+    }
 }
 
 Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                     const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+                                                                                     const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
+    return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
 }
 
 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run()
@@ -516,7 +490,14 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run()
         _permute_input_to_nchw.run();
     }
     CLScheduler::get().enqueue(*_border_handler);
-    CLScheduler::get().enqueue(*_kernel);
+    if(_is_nhwc && !_needs_permute)
+    {
+        CLScheduler::get().enqueue(*_kernel_nhwc);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(*_kernel_nchw);
+    }
 
     if(_needs_permute)
     {
@@ -552,14 +533,6 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepar
             _original_weights->mark_as_unused();
         }
 
-        if(_needs_weights_reshape)
-        {
-            ARM_COMPUTE_ERROR_ON(_needs_permute);
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-            _permuted_weights.allocator()->allocate();
-            CLScheduler::get().enqueue(*_reshape_weights);
-            _original_weights->mark_as_unused();
-        }
         _is_prepared = true;
     }
 }
@@ -580,9 +553,8 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
                                             unsigned int depth_multiplier, ActivationLayerInfo act_info,
                                             const Size2D &dilation)
 {
-    const GPUTarget gpu_target = CLScheduler::get().target();
-    _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
-                                                         dilation, gpu_target);
+    _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
+                                                         dilation);
     switch(_depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
@@ -603,12 +575,11 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
 Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                              unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    const GPUTarget gpu_target = CLScheduler::get().target();
-    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, gpu_target);
+    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
     switch(depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
-            return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
+            return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
         case DepthwiseConvolutionFunction::GENERIC:
             return CLDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
         default:
@@ -618,10 +589,9 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
 
 DepthwiseConvolutionFunction CLDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases,
                                                                                             const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, GPUTarget gpu_target)
+                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation)) && (is_data_type_float(input->data_type())
-       || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD))
+    if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)))
     {
         return DepthwiseConvolutionFunction::OPTIMIZED;
     }
--
cgit v1.2.1
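
After this patch, kernel selection depends only on the validate() result: get_depthwiseconvolution_function() returns OPTIMIZED whenever the internal 3x3 path accepts the configuration and GENERIC otherwise, with no GPUTarget check. The sketch below is not part of the patch; it is a minimal usage example of the public function after the rework, and the tensor shapes, padding and the assumption that this particular F32 NHWC configuration resolves to the OPTIMIZED path are illustrative only.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    // Create an OpenCL context, command queue and kernel library for the CL runtime.
    CLScheduler::get().default_init();

    // Helper: F32 tensor info with NHWC layout (shapes below are illustrative).
    auto nhwc_info = [](const TensorShape &shape)
    {
        TensorInfo info(shape, 1, DataType::F32);
        info.set_data_layout(DataLayout::NHWC);
        return info;
    };

    // NHWC shapes are ordered (C, W, H, N) in Compute Library.
    CLTensor src, weights, biases, dst;
    src.allocator()->init(nhwc_info(TensorShape(32U, 27U, 27U, 1U)));
    weights.allocator()->init(nhwc_info(TensorShape(32U, 3U, 3U)));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(nhwc_info(TensorShape(32U, 27U, 27U, 1U)));

    // 3x3 depthwise convolution, stride 1, pad 1, depth multiplier 1.
    const PadStrideInfo conv_info(1, 1, 1, 1);

    // validate() exercises the same OPTIMIZED/GENERIC selection as configure().
    const Status status = CLDepthwiseConvolutionLayer::validate(src.info(), weights.info(), biases.info(), dst.info(), conv_info, 1);
    ARM_COMPUTE_ERROR_THROW_ON(status);

    CLDepthwiseConvolutionLayer dwc;
    dwc.configure(&src, &weights, &biases, &dst, conv_info, 1 /* depth_multiplier */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src, weights and biases here ...

    dwc.run();
    CLScheduler::get().sync();
    return 0;
}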