From bc4e31113be0af320f44b338969d6972b64ca4de Mon Sep 17 00:00:00 2001 From: SiCongLi Date: Tue, 29 Jun 2021 13:18:30 +0100 Subject: Implement FP GPU depthwise convolution 1x1 kernel for in-place computation * Implement in-place graph node mutator for 1x1 depthwise convolution * Add in-place to validation fixture except for DepthwiseConvolutionLayerNativeValidationFixture as it would be a duplicate test otherwise (DepthwiseConvolutionLayerNative test tests the underlying kernel) Resolves: COMPMID-4432 Change-Id: Id7f10f5ebdce7d49f550c0b62dbaaab7f5b59d29 Signed-off-by: SiCongLi Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5874 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Georgios Pinitas --- .../CL/functions/CLDepthwiseConvolutionLayer.h | 35 +++----- .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 33 ++++++-- .../CLDepthwiseConvolutionLayerNativeKernel.h | 95 +++++++++------------- src/graph/mutators/InPlaceOperationMutator.cpp | 75 +++++++++++++++++ .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 17 +++- tests/datasets/DepthwiseConvolutionLayerDataset.h | 19 +++++ tests/validation/CL/DepthwiseConvolutionLayer.cpp | 30 ++++++- .../fixtures/DepthwiseConvolutionLayerFixture.h | 83 +++++++++++++------ 8 files changed, 272 insertions(+), 115 deletions(-) diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index f31a17d9cb..bbb00a1ebc 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -72,48 +72,33 @@ public: * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * + * @param[in] compile_context The compile context to be used. * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. 
Data layout supported: NHWC, NCHW * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * + * @note: For in-place support, please check @ref CLDepthwiseConvolutionLayerNativeKernel */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + /** Initialize the function's source, destination, weights and convolution information. * - * @param[in] compile_context The compile context to be used. - * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. 
Data layout supported: NHWC, NCHW - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * Similar to @ref CLDepthwiseConvolutionLayer::configure() */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. 
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * Similar to @ref CLDepthwiseConvolutionLayer::configure() * * @return a status */ diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 4cc0e462c4..eb1cf146af 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -46,7 +46,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + bool in_place = false; + if(output == nullptr || output == input) + { + in_place = true; + output = input; + } + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); @@ -58,6 +64,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_UNUSED(idx_c); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier)); + // In place 
restrictions + if(in_place) + { + const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U); + ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1U); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride() != std::make_pair(1U, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON(dilation != Size2D(1U, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.has_padding()); // Note that in principle padding can be supported with in_place but we choose not to support it + } + const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); @@ -139,19 +157,24 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel _type = CLKernelType::DEPTHWISE; } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, +void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers,
output_shifts); } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); + if(output == nullptr) + { + // In-place + output = input; + } ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 325f4e7067..068131f434 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,64 +49,47 @@ public: CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default; /** Initialize the function's source, destination and parameters * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. 
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 - * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC + * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. + * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input. 
+ * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread + * @param[in] dwc_info Depthwise convolution layer info + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * + * @note: In-place is only supported when + * * data layout: NHWC + * * filter: 1x1 + * * @p depth_multiplier: 1 + * * strides: 1 + * * dilation: 1 + * * no padding + * * no change of data layout after configure */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + /** Initialize the function's source, destination and parameters * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. 
Data layout supported: NHWC - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 - * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). 
Supported data types: S32 + * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, + void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 - * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, - * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() * * @return a status */ @@ -118,14 +101,14 @@ public: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input; - const ICLTensor *_weights; - const ICLTensor *_biases; - ICLTensor *_output; - unsigned int _depth_multiplier; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _is_quantized; + const ICLTensor *_input {}; + const ICLTensor *_weights{}; + const ICLTensor *_biases{}; + ICLTensor *_output{}; + unsigned int _depth_multiplier{ 0 }; + const ICLTensor *_output_multipliers{}; + const ICLTensor *_output_shifts{}; + bool _is_quantized{ false }; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp index 616ec5c73d..86236e8854 100644 --- a/src/graph/mutators/InPlaceOperationMutator.cpp +++ b/src/graph/mutators/InPlaceOperationMutator.cpp @@ -23,9 +23,15 @@ */ #include "arm_compute/graph/mutators/InPlaceOperationMutator.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h" +#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h" +#include "support/Cast.h" + +using namespace arm_compute::utils::cast; namespace 
arm_compute { @@ -82,6 +88,69 @@ void set_new_output_and_inherit_accessor(std::unique_ptr &node, Tensor *o node->set_output_tensor(new_output->id(), 0); } +// Try to mutate the node to perform the depthwise in-place calculation +void try_in_place_depthwiseconv(std::unique_ptr &node) +{ + // Get input edge + Edge *input_edge = node->input_edge(0); + Edge *weight_edge = node->input_edge(1); + ARM_COMPUTE_ERROR_ON(input_edge == nullptr || weight_edge == nullptr); + + auto input_tensor = input_edge->tensor(); + auto weight_tensor = weight_edge->tensor(); + ARM_COMPUTE_ERROR_ON(input_tensor == nullptr || weight_tensor == nullptr); + + const auto input_shape = input_tensor->desc().shape; + const auto qinfo_input = input_tensor->desc().quant_info; + + const auto weight_shape = weight_tensor->desc().shape; + const auto weight_layout = weight_tensor->desc().layout; + + // Extract PadStrideInfo and depth multiplier + PadStrideInfo conv_info{}; + unsigned int depth_multiplier{}; + if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer) + { + conv_info = polymorphic_downcast(node.get())->convolution_info(); + depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); + } + else if(node->type() == NodeType::DepthwiseConvolutionLayer) + { + conv_info = polymorphic_downcast(node.get())->convolution_info(); + depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); + } + + // Get current output tensor + auto current_output_tensor = node->output(0); + ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr); + const auto out_shape = current_output_tensor->desc().shape; + const auto qinfo_out = current_output_tensor->desc().quant_info; + + bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr); + + // Specify conditions with which input can be in-placed + input_can_in_place &= weight_layout == 
input_tensor->desc().layout && weight_layout == DataLayout::NHWC; + + const int weights_width_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::WIDTH); + const int weights_height_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::HEIGHT); + const bool is_1x1 = weight_shape[weights_width_idx] == 1U && weight_shape[weights_height_idx] == 1U; + input_can_in_place &= is_1x1; + + input_can_in_place &= depth_multiplier == 1; + input_can_in_place &= conv_info.stride() == std::make_pair(1U, 1U); + input_can_in_place &= !conv_info.has_padding(); + // NOTE: Dilation should also be (1, 1). However currently dilation is not supported in the depthwise conv node + + if(input_can_in_place) + { + set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor); + } + else + { + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + } +} + // Try to mutate the node to perform the elementwise in-place calculation void try_in_place_elementwise(std::unique_ptr &node) { @@ -148,6 +217,8 @@ void InPlaceOperationMutator::mutate(Graph &g) NodeType::BatchNormalizationLayer, NodeType::EltwiseLayer, NodeType::UnaryEltwiseLayer, + NodeType::DepthwiseConvolutionLayer, + NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer, NodeType::PrintLayer }; @@ -166,6 +237,10 @@ void InPlaceOperationMutator::mutate(Graph &g) { try_in_place_elementwise(node); } + else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer) + { + try_in_place_depthwiseconv(node); + } else { // Get current and new output tensors diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index c7520cd087..a826f85c5c 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -136,11 +136,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), + output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, @@ -220,6 +220,11 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { + const bool in_place = input == output || output == nullptr; + if(in_place) + { + output = input; + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); @@ -254,6 +259,7 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate if(needs_permute) { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout"); TensorShape permuted_input_shape = input->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; @@ -309,7 +315,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() _output_shifts.map(); quantization::compute_quantized_multipliers_and_shifts(_input->info(), _original_weights->info(), - _output->info(), + _output != nullptr ? 
_output->info() : _input->info(), reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); @@ -549,6 +555,11 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { + if(output == nullptr) + { + // In-place + output = input; + } _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation); switch(_depth_conv_func) diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h index a19e7ee8cf..53b5248374 100644 --- a/tests/datasets/DepthwiseConvolutionLayerDataset.h +++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h @@ -262,6 +262,25 @@ public: add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 4, 4, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U)); } }; + +/** Dataset containing in-place 1x1 depthwise convolution shapes. 
+ * + * For a depthwise convolution op to be in-place: + * * Output has the same shape as the input; + * * 1x1 filter + * * stride == 1 + * * dilations == 1 + * * No paddings +*/ +class SmallInPlaceDepthwiseConvolutionLayerDataset final : public DepthwiseConvolutionLayerDataset +{ +public: + SmallInPlaceDepthwiseConvolutionLayerDataset() + { + add_config(TensorShape(7U, 7U, 1U), Size2D(1U, 1U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(11U, 13U, 16U), Size2D(1U, 1U), PadStrideInfo(1, 1, 0, 0)); + } +}; } // namespace datasets } // namespace test } // namespace arm_compute diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp index c88f7c1624..22922f41a2 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp @@ -156,6 +156,8 @@ template using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture; template using CLDepthwiseConvolutionLayerMixedDataLayoutFixture = DepthwiseConvolutionLayerValidationFixture; +template +using CLDepthwiseConvolutionLayerInPlaceFixture = DepthwiseConvolutionLayerValidationFixture; TEST_SUITE(Float) TEST_SUITE(FP16) @@ -290,6 +292,19 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, f } TEST_SUITE_END() // Dilation TEST_SUITE_END() // Generic + +TEST_SUITE(InPlace) +FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(), + framework::dataset::make("DepthMultiplier", { 1 })), + framework::dataset::make("DataType", + DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_src), _reference, tolerance_f16, tolerance_num); +} +TEST_SUITE_END() // InPlace TEST_SUITE_END() // FP16 TEST_SUITE(FP32) @@ -355,7 +370,7 @@ 
FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerMixedD framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", DataLayout::NHWC)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f32); } @@ -436,6 +451,19 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, } TEST_SUITE_END() // Dilation TEST_SUITE_END() // Generic + +TEST_SUITE(InPlace) +FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(), + framework::dataset::make("DepthMultiplier", { 1 })), + framework::dataset::make("DataType", + DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_src), _reference, tolerance_f32); +} +TEST_SUITE_END() // InPlace TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h index 19ec6b2560..c255cc5c13 100644 --- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h +++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h @@ -59,8 +59,9 @@ public: void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, Size2D dilation, unsigned int depth_multiplier, DataType input_data_type, DataType weights_data_type, QuantizationInfo input_quantization_info, QuantizationInfo weights_quantization_info, QuantizationInfo output_quantization_info, - DataLayout data_layout, ActivationLayerInfo act_info, bool mixed_layout = false) + DataLayout data_layout, ActivationLayerInfo act_info, bool mixed_layout = false, bool in_place = false) { + 
ARM_COMPUTE_ERROR_ON(mixed_layout && in_place); _mixed_layout = mixed_layout; _input_shape = in_shape; _input_data_type = input_data_type; @@ -73,6 +74,7 @@ public: _act_info = act_info; _depth_multiplier = depth_multiplier; _dilation = dilation; + _in_place = in_place; _bias_data_type = is_data_type_quantized(_input_data_type) ? DataType::S32 : _input_data_type; @@ -101,13 +103,18 @@ public: } // Create tensors - _src = create_tensor(input_shape, _input_data_type, 1, _input_quantization_info, _data_layout); - _weights = create_tensor(weights_shape, _weights_data_type, 1, _weights_quantization_info, _data_layout); - _biases = create_tensor(_biases_shape, _bias_data_type, 1, _input_quantization_info, _data_layout); - _target = create_tensor(output_shape, _input_data_type, 1, _output_quantization_info, _data_layout); + _src = create_tensor(input_shape, _input_data_type, 1, _input_quantization_info, _data_layout); + _weights = create_tensor(weights_shape, _weights_data_type, 1, _weights_quantization_info, _data_layout); + _biases = create_tensor(_biases_shape, _bias_data_type, 1, _input_quantization_info, _data_layout); + TensorType *target_to_use = nullptr; + if(!_in_place) + { + _target = create_tensor(output_shape, _input_data_type, 1, _output_quantization_info, _data_layout); + target_to_use = &_target; + } // Create Depthwise Convolution configure function - _dwc.configure(&_src, &_weights, &_biases, &_target, _pad_stride_info, _depth_multiplier, _act_info, _dilation); + _dwc.configure(&_src, &_weights, &_biases, target_to_use, _pad_stride_info, _depth_multiplier, _act_info, _dilation); ARM_COMPUTE_ASSERT(_src.info()->is_resizable()); ARM_COMPUTE_ASSERT(_weights.info()->is_resizable()); @@ -117,18 +124,26 @@ public: void allocate_and_run_target() { - add_padding_x({ &_src, &_weights, &_biases, &_target }, _data_layout); + add_padding_x({ &_src, &_weights, &_biases }, _data_layout); + if(!_in_place) + { + add_padding_x({ &_target }, _data_layout); + } // Allocate 
tensors _src.allocator()->allocate(); _weights.allocator()->allocate(); _biases.allocator()->allocate(); - _target.allocator()->allocate(); ARM_COMPUTE_ASSERT(!_src.info()->is_resizable()); ARM_COMPUTE_ASSERT(!_weights.info()->is_resizable()); ARM_COMPUTE_ASSERT(!_biases.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!_target.info()->is_resizable()); + + if(!_in_place) + { + _target.allocator()->allocate(); + ARM_COMPUTE_ASSERT(!_target.info()->is_resizable()); + } // Fill tensors fill(AccessorType(_src), 0); @@ -163,6 +178,7 @@ public: protected: void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst) { + ARM_COMPUTE_ERROR_ON(_in_place); // Test Multi DataLayout graph cases, when the data layout changes after configure src.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW); dst.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW); @@ -240,9 +256,10 @@ protected: unsigned int _depth_multiplier{}; Size2D _dilation{}; bool _mixed_layout{ false }; + bool _in_place{ false }; }; -template +template class DepthwiseConvolutionLayerValidationFixture : public DepthwiseConvolutionLayerValidationGenericFixture { public: @@ -252,7 +269,7 @@ public: { DepthwiseConvolutionLayerValidationGenericFixture::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier, data_type, data_type, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), - data_layout, act_info, mixed_layout); + data_layout, act_info, mixed_layout, in_place); } }; @@ -393,7 +410,7 @@ protected: unsigned int _depth_multiplier{}; }; -template +template class DepthwiseConvolutionLayerNativeConfigurableValidationFixture : public DepthwiseConvolutionLayerValidationGenericFixture { public: @@ -407,6 +424,7 @@ public: _data_layout = data_layout; _act_info = act_info; _n0 = n0; + _in_place = in_place; _input_shape = TensorShape(width, height, channel, batch); _weights_shape = 
TensorShape(kernel_size.width, kernel_size.height, channel * _depth_multiplier); @@ -434,10 +452,15 @@ public: } // Create tensors - _src = create_tensor(input_shape, _data_type, 1, QuantizationInfo(), _data_layout); - _weights = create_tensor(weights_shape, _data_type, 1, QuantizationInfo(), _data_layout); - _biases = create_tensor(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout); - _target = create_tensor(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout); + _src = create_tensor(input_shape, _data_type, 1, QuantizationInfo(), _data_layout); + _weights = create_tensor(weights_shape, _data_type, 1, QuantizationInfo(), _data_layout); + _biases = create_tensor(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout); + TensorType *target_to_use = nullptr; + if(!_in_place) + { + _target = create_tensor(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout); + target_to_use = &_target; + } DWCWeightsKernelInfo dwc_weights_info; dwc_weights_info.n0 = _n0; @@ -446,7 +469,7 @@ public: dwc_info.activation_info = _act_info; // Create Depthwise Convolution configure function - _dwc.configure(&_src, &_weights, &_biases, &_target, dwc_weights_info, dwc_info, _conv_info, _depth_multiplier, _dilation); + _dwc.configure(&_src, &_weights, &_biases, target_to_use, dwc_weights_info, dwc_info, _conv_info, _depth_multiplier, _dilation); ARM_COMPUTE_ASSERT(_src.info()->is_resizable()); ARM_COMPUTE_ASSERT(_weights.info()->is_resizable()); @@ -462,12 +485,15 @@ public: _src.allocator()->allocate(); _weights.allocator()->allocate(); _biases.allocator()->allocate(); - _target.allocator()->allocate(); ARM_COMPUTE_ASSERT(!_src.info()->is_resizable()); ARM_COMPUTE_ASSERT(!_weights.info()->is_resizable()); ARM_COMPUTE_ASSERT(!_biases.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!_target.info()->is_resizable()); + if(!_in_place) + { + _target.allocator()->allocate(); + ARM_COMPUTE_ASSERT(!_target.info()->is_resizable()); + } // Fill tensors 
fill(AccessorType(_src), 0); @@ -476,13 +502,19 @@ public: // Test Multi DataLayout graph cases, when the data layout changes after configure _src.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW); - _target.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW); + if(!_in_place) + { + _target.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW); + } // Compute function _dwc.run(); // Reinstating original data layout for the test suite to properly check the values - _target.info()->set_data_layout(_data_layout); + if(!_in_place) + { + _target.info()->set_data_layout(_data_layout); + } } void compute_reference() @@ -541,9 +573,10 @@ protected: Size2D _dilation{}; unsigned int _depth_multiplier{}; unsigned int _n0{}; + bool _in_place{ false }; }; -template +template class DepthwiseConvolutionLayerValidationQuantizedFixture : public DepthwiseConvolutionLayerValidationGenericFixture { public: @@ -553,11 +586,11 @@ public: { DepthwiseConvolutionLayerValidationGenericFixture::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier, data_type, data_type, input_quantization_info, input_quantization_info, output_quantization_info, - data_layout, act_info, mixed_layout); + data_layout, act_info, mixed_layout, in_place); } }; -template +template class DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture : public DepthwiseConvolutionLayerValidationGenericFixture { public: @@ -579,7 +612,7 @@ public: DepthwiseConvolutionLayerValidationGenericFixture::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier, input_data_type, weights_data_type, input_quantization_info, QuantizationInfo(weights_scales), output_quantization_info, - data_layout, act_info); + data_layout, act_info, false, in_place); } }; } // namespace validation -- cgit v1.2.1