From bc4e31113be0af320f44b338969d6972b64ca4de Mon Sep 17 00:00:00 2001 From: SiCongLi Date: Tue, 29 Jun 2021 13:18:30 +0100 Subject: Implement FP GPU depthwise convolution 1x1 kernel for in-place computation * Implement in-place graph node mutator for 1x1 depthwise convolution * Add in-place to validation fixture except for DepthwiseConvolutionLayerNativeValidationFixture as it would be a duplicate test otherwise (DepthwiseConvolutionLayerNative test tests the underlying kernel) Resolves: COMPMID-4432 Change-Id: Id7f10f5ebdce7d49f550c0b62dbaaab7f5b59d29 Signed-off-by: SiCongLi Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5874 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Georgios Pinitas --- .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 33 ++++++-- .../CLDepthwiseConvolutionLayerNativeKernel.h | 95 +++++++++------------- src/graph/mutators/InPlaceOperationMutator.cpp | 75 +++++++++++++++++ .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 17 +++- 4 files changed, 156 insertions(+), 64 deletions(-) (limited to 'src') diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 4cc0e462c4..eb1cf146af 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -46,7 +46,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + bool in_place = false; + if(output == nullptr || output == input) + { + in_place = true; + output = input; + } + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); @@ -58,6 +64,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_UNUSED(idx_c); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier)); + // In place restrictions + if(in_place) + { + const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U); + ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1U); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride() != std::make_pair(1U, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON(dilation != Size2D(1U, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.has_padding()); // Note that in principle padding can be supported with in_place but we choose not to support it + } + const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); @@ -139,19 +157,24 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel _type = CLKernelType::DEPTHWISE; } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, +void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const DWCWeightsKernelInfo 
&dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts); } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); + if(output == nullptr) + { + // In-place + output = input; + } ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 325f4e7067..068131f434 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -49,64 +49,47 @@ public: CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default; /** Initialize the function's source, destination and parameters * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 - * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC + * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. 
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input. + * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread + * @param[in] dwc_info Depthwise convolution layer info + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, * the number of multipliers must be equal to the number of filters (IFM). 
Supported data types: S32 + * + * @note: In-place is only supported when + * * data layout: NHWC + * * filter: 1x1 + * * @p depth_multiplier: 1 + * * strides: 1 + * * dilation: 1 + * * no padding + * * no change of data layout after configure */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + /** Initialize the function's source, destination and parameters * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. 
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 - * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, + void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. 
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 - * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() * * @return a status */ @@ -118,14 +101,14 @@ public: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input; - const ICLTensor *_weights; - const ICLTensor *_biases; - ICLTensor *_output; - unsigned int _depth_multiplier; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _is_quantized; + const ICLTensor *_input {}; + const ICLTensor *_weights{}; + const ICLTensor *_biases{}; + ICLTensor *_output{}; + unsigned int _depth_multiplier{ 0 }; + const ICLTensor *_output_multipliers{}; + const ICLTensor *_output_shifts{}; + bool _is_quantized{ false }; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp index 616ec5c73d..86236e8854 100644 --- a/src/graph/mutators/InPlaceOperationMutator.cpp +++ 
b/src/graph/mutators/InPlaceOperationMutator.cpp @@ -23,9 +23,15 @@ */ #include "arm_compute/graph/mutators/InPlaceOperationMutator.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h" +#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h" +#include "support/Cast.h" + +using namespace arm_compute::utils::cast; namespace arm_compute { @@ -82,6 +88,69 @@ void set_new_output_and_inherit_accessor(std::unique_ptr &node, Tensor *o node->set_output_tensor(new_output->id(), 0); } +// Try to mutate the node to perform the depthwise in-place calculation +void try_in_place_depthwiseconv(std::unique_ptr &node) +{ + // Get input edge + Edge *input_edge = node->input_edge(0); + Edge *weight_edge = node->input_edge(1); + ARM_COMPUTE_ERROR_ON(input_edge == nullptr || weight_edge == nullptr); + + auto input_tensor = input_edge->tensor(); + auto weight_tensor = weight_edge->tensor(); + ARM_COMPUTE_ERROR_ON(input_tensor == nullptr || weight_tensor == nullptr); + + const auto input_shape = input_tensor->desc().shape; + const auto qinfo_input = input_tensor->desc().quant_info; + + const auto weight_shape = weight_tensor->desc().shape; + const auto weight_layout = weight_tensor->desc().layout; + + // Extract PadStrideInfo and depth multiplier + PadStrideInfo conv_info{}; + unsigned int depth_multiplier{}; + if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer) + { + conv_info = polymorphic_downcast(node.get())->convolution_info(); + depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); + } + else if(node->type() == NodeType::DepthwiseConvolutionLayer) + { + conv_info = polymorphic_downcast(node.get())->convolution_info(); + depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); + } + + // Get current output tensor + auto 
current_output_tensor = node->output(0); + ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr); + const auto out_shape = current_output_tensor->desc().shape; + const auto qinfo_out = current_output_tensor->desc().quant_info; + + bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr); + + // Specify conditions with which input can be in-placed + input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC; + + const int weights_width_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::WIDTH); + const int weights_height_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::HEIGHT); + const bool is_1x1 = weight_shape[weights_width_idx] == 1U && weight_shape[weights_height_idx] == 1U; + input_can_in_place &= is_1x1; + + input_can_in_place &= depth_multiplier == 1; + input_can_in_place &= conv_info.stride() == std::make_pair(1U, 1U); + input_can_in_place &= !conv_info.has_padding(); + // NOTE: Dilation should also be (1, 1). 
However currently dilation is not supported in the depthwise conv node + + if(input_can_in_place) + { + set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor); + } + else + { + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + } +} + // Try to mutate the node to perform the elementwise in-place calculation void try_in_place_elementwise(std::unique_ptr &node) { @@ -148,6 +217,8 @@ void InPlaceOperationMutator::mutate(Graph &g) NodeType::BatchNormalizationLayer, NodeType::EltwiseLayer, NodeType::UnaryEltwiseLayer, + NodeType::DepthwiseConvolutionLayer, + NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer, NodeType::PrintLayer }; @@ -166,6 +237,10 @@ void InPlaceOperationMutator::mutate(Graph &g) { try_in_place_elementwise(node); } + else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer) + { + try_in_place_depthwiseconv(node); + } else { // Get current and new output tensors diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index c7520cd087..a826f85c5c 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -136,11 +136,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), + output != nullptr ? 
output->info() : input->info(), conv_info, depth_multiplier, act_info, @@ -220,6 +220,11 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { + const bool in_place = input == output || output == nullptr; + if(in_place) + { + output = input; + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); @@ -254,6 +259,7 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate if(needs_permute) { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout"); TensorShape permuted_input_shape = input->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; @@ -309,7 +315,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() _output_shifts.map(); quantization::compute_quantized_multipliers_and_shifts(_input->info(), _original_weights->info(), - _output->info(), + _output != nullptr ? _output->info() : _input->info(), reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); @@ -549,6 +555,11 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { + if(output == nullptr) + { + // In-place + output = input; + } _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation); switch(_depth_conv_func) -- cgit v1.2.1