author     SiCongLi <sicong.li@arm.com>   2021-06-29 13:18:30 +0100
committer  SiCong Li <sicong.li@arm.com>  2021-07-02 12:05:37 +0000
commit     bc4e31113be0af320f44b338969d6972b64ca4de (patch)
tree       21e44fad55a80794a9609e847188764cf42f13d7
parent     66831659fdef07c428993dccfa5d92416bae1ef9 (diff)
Implement FP GPU depthwise convolution 1x1 kernel for in-place computation
* Implement in-place graph node mutator for 1x1 depthwise convolution
* Add in-place support to the validation fixtures, except for DepthwiseConvolutionLayerNativeValidationFixture, as it would otherwise duplicate an existing test (the DepthwiseConvolutionLayerNative test already exercises the underlying kernel)

Resolves: COMPMID-4432
Change-Id: Id7f10f5ebdce7d49f550c0b62dbaaab7f5b59d29
Signed-off-by: SiCongLi <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5874
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--  arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h   | 35
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp  | 33
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h    | 95
-rw-r--r--  src/graph/mutators/InPlaceOperationMutator.cpp                   | 75
-rw-r--r--  src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp         | 17
-rw-r--r--  tests/datasets/DepthwiseConvolutionLayerDataset.h                | 19
-rw-r--r--  tests/validation/CL/DepthwiseConvolutionLayer.cpp                | 30
-rw-r--r--  tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h     | 83
8 files changed, 272 insertions, 115 deletions
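
Usage note: the patch makes in-place execution opt-in through the output argument of configure(): passing nullptr (or the input tensor itself) as the destination aliases the output onto the input. Below is a minimal sketch of the new function-level API; the tensor shapes, scheduler setup and main() scaffolding are illustrative assumptions, not part of the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative NHWC FP32 shapes. In-place requires a 1x1 filter, stride 1,
    // depth_multiplier 1, dilation 1 and no padding (see the kernel header note).
    TensorInfo src_info(TensorShape(16U, 11U, 13U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo weights_info(TensorShape(16U, 1U, 1U), 1, DataType::F32);
    weights_info.set_data_layout(DataLayout::NHWC);
    TensorInfo biases_info(TensorShape(16U), 1, DataType::F32);

    CLTensor src{}, weights{}, biases{};
    src.allocator()->init(src_info);
    weights.allocator()->init(weights_info);
    biases.allocator()->init(biases_info);

    CLDepthwiseConvolutionLayer dwc{};
    // Passing nullptr as the output requests in-place computation; passing
    // &src is equivalent. The result overwrites src.
    dwc.configure(&src, &weights, &biases, nullptr, PadStrideInfo(1, 1, 0, 0));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();

    // Tensors are left unfilled for brevity; fill src/weights/biases before
    // relying on the numerical result.
    dwc.run();
    return 0;
}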
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index f31a17d9cb..bbb00a1ebc 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -72,48 +72,33 @@ public:
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
* |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
*
+ * @param[in] compile_context The compile context to be used.
* @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
* @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
* Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
* @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
* Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
* @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ *
+ * @note: For in-place support, please check @ref CLDepthwiseConvolutionLayerNativeKernel
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+
/** Initialize the function's source, destination, weights and convolution information.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * Similar to @ref CLDepthwiseConvolutionLayer::configure()
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * Similar to @ref CLDepthwiseConvolutionLayer::configure()
*
* @return a status
*/
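
Because validate() now follows the same aliasing rule (a null or aliased output maps back onto the input), an in-place configuration can be checked before any allocation by passing the input info as the output info. A hedged sketch; the helper name and the fixed conv_info are assumptions for illustration:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

// Hypothetical pre-flight check for an in-place 1x1 depthwise convolution.
Status check_in_place_dwc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases)
{
    // Passing src as the destination info mirrors the in-place aliasing
    // performed inside configure()/validate().
    return CLDepthwiseConvolutionLayer::validate(src, weights, biases, src,
                                                 PadStrideInfo(1, 1, 0, 0));
}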
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 4cc0e462c4..eb1cf146af 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -46,7 +46,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
{
ARM_COMPUTE_UNUSED(dwc_info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ bool in_place = false;
+ if(output == nullptr || output == input)
+ {
+ in_place = true;
+ output = input;
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
@@ -58,6 +64,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_UNUSED(idx_c);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
+ // In-place restrictions
+ if(in_place)
+ {
+ const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride() != std::make_pair(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.has_padding()); // Note that in principle padding can be supported with in_place, but we choose not to support it
+ }
+
const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
@@ -139,19 +157,24 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
_type = CLKernelType::DEPTHWISE;
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts);
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
const DWCWeightsKernelInfo &dwc_weights_info,
const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ if(output == nullptr)
+ {
+ // In-place
+ output = input;
+ }
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
(output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 325f4e7067..068131f434 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,64 +49,47 @@ public:
CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
/** Initialize the function's source, destination and parameters
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+ * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
+ * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+ * @param[in] dwc_info Depthwise convolution layer info
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+ * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
* the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ *
+ * @note: In-place is only supported when
+ * * data layout: NHWC
+ * * filter: 1x1
+ * * @p depth_multiplier: 1
+ * * strides: 1
+ * * dilation: 1
+ * * no padding
+ * * no change of data layout after configure
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+
/** Initialize the function's source, destination and parameters
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
+ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*
* @return a status
*/
@@ -118,14 +101,14 @@ public:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- ICLTensor *_output;
- unsigned int _depth_multiplier;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
+ const ICLTensor *_input{};
+ const ICLTensor *_weights{};
+ const ICLTensor *_biases{};
+ ICLTensor *_output{};
+ unsigned int _depth_multiplier{ 0 };
+ const ICLTensor *_output_multipliers{};
+ const ICLTensor *_output_shifts{};
+ bool _is_quantized{ false };
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
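
The in-place restriction list in the note above boils down to a handful of predicates on the weights shape and convolution info. A condensed sketch follows (a hypothetical helper mirroring the checks this patch adds to validate_arguments; not part of the patch itself):

#include <utility>

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Hypothetical helper: true if the native kernel may run in-place under the
// conditions this patch enforces.
bool dwc_supports_in_place(const TensorShape &weights_shape, DataLayout layout,
                           unsigned int depth_multiplier, const PadStrideInfo &conv_info,
                           const Size2D &dilation)
{
    const int w_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH);
    const int h_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::HEIGHT);
    return layout == DataLayout::NHWC                                  // NHWC only
           && weights_shape[w_idx] == 1U && weights_shape[h_idx] == 1U // 1x1 filter
           && depth_multiplier == 1U                                   // no channel expansion
           && conv_info.stride() == std::make_pair(1U, 1U)             // unit strides
           && dilation == Size2D(1U, 1U)                               // unit dilation
           && !conv_info.has_padding();                                // no padding
}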
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
index 616ec5c73d..86236e8854 100644
--- a/src/graph/mutators/InPlaceOperationMutator.cpp
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -23,9 +23,15 @@
*/
#include "arm_compute/graph/mutators/InPlaceOperationMutator.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
+#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
+#include "support/Cast.h"
+
+using namespace arm_compute::utils::cast;
namespace arm_compute
{
@@ -82,6 +88,69 @@ void set_new_output_and_inherit_accessor(std::unique_ptr<INode> &node, Tensor *o
node->set_output_tensor(new_output->id(), 0);
}
+// Try to mutate the node to perform the depthwise in-place calculation
+void try_in_place_depthwiseconv(std::unique_ptr<INode> &node)
+{
+ // Get input edge
+ Edge *input_edge = node->input_edge(0);
+ Edge *weight_edge = node->input_edge(1);
+ ARM_COMPUTE_ERROR_ON(input_edge == nullptr || weight_edge == nullptr);
+
+ auto input_tensor = input_edge->tensor();
+ auto weight_tensor = weight_edge->tensor();
+ ARM_COMPUTE_ERROR_ON(input_tensor == nullptr || weight_tensor == nullptr);
+
+ const auto input_shape = input_tensor->desc().shape;
+ const auto qinfo_input = input_tensor->desc().quant_info;
+
+ const auto weight_shape = weight_tensor->desc().shape;
+ const auto weight_layout = weight_tensor->desc().layout;
+
+ // Extract PadStrideInfo and depth multiplier
+ PadStrideInfo conv_info{};
+ unsigned int depth_multiplier{};
+ if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer)
+ {
+ conv_info = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->convolution_info();
+ depth_multiplier = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->depth_multiplier();
+ }
+ else if(node->type() == NodeType::DepthwiseConvolutionLayer)
+ {
+ conv_info = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->convolution_info();
+ depth_multiplier = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->depth_multiplier();
+ }
+
+ // Get current output tensor
+ auto current_output_tensor = node->output(0);
+ ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr);
+ const auto out_shape = current_output_tensor->desc().shape;
+ const auto qinfo_out = current_output_tensor->desc().quant_info;
+
+ bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr);
+
+ // Specify conditions with which input can be in-placed
+ input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC;
+
+ const int weights_width_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::WIDTH);
+ const int weights_height_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::HEIGHT);
+ const bool is_1x1 = weight_shape[weights_width_idx] == 1U && weight_shape[weights_height_idx] == 1U;
+ input_can_in_place &= is_1x1;
+
+ input_can_in_place &= depth_multiplier == 1;
+ input_can_in_place &= conv_info.stride() == std::make_pair(1U, 1U);
+ input_can_in_place &= !conv_info.has_padding();
+ // NOTE: Dilation should also be (1, 1). However currently dilation is not supported in the depthwise conv node
+
+ if(input_can_in_place)
+ {
+ set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor);
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info is different.\n");
+ }
+}
+
// Try to mutate the node to perform the elementwise in-place calculation
void try_in_place_elementwise(std::unique_ptr<INode> &node)
{
@@ -148,6 +217,8 @@ void InPlaceOperationMutator::mutate(Graph &g)
NodeType::BatchNormalizationLayer,
NodeType::EltwiseLayer,
NodeType::UnaryEltwiseLayer,
+ NodeType::DepthwiseConvolutionLayer,
+ NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer,
NodeType::PrintLayer
};
@@ -166,6 +237,10 @@ void InPlaceOperationMutator::mutate(Graph &g)
{
try_in_place_elementwise(node);
}
+ else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer)
+ {
+ try_in_place_depthwiseconv(node);
+ }
else
{
// Get current and new output tensors
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index c7520cd087..a826f85c5c 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -136,11 +136,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
ICLTensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
weights->info(),
biases != nullptr ? biases->info() : nullptr,
- output->info(),
+ output != nullptr ? output->info() : input->info(),
conv_info,
depth_multiplier,
act_info,
@@ -220,6 +220,11 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
+ const bool in_place = input == output || output == nullptr;
+ if(in_place)
+ {
+ output = input;
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
@@ -254,6 +259,7 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
if(needs_permute)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
@@ -309,7 +315,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
_output_shifts.map();
quantization::compute_quantized_multipliers_and_shifts(_input->info(),
_original_weights->info(),
- _output->info(),
+ _output != nullptr ? _output->info() : _input->info(),
reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
_output_multipliers.unmap();
@@ -549,6 +555,11 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
unsigned int depth_multiplier,
ActivationLayerInfo act_info, const Size2D &dilation)
{
+ if(output == nullptr)
+ {
+ // In-place
+ output = input;
+ }
_depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
dilation);
switch(_depth_conv_func)
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index a19e7ee8cf..53b5248374 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -262,6 +262,25 @@ public:
add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 4, 4, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U));
}
};
+
+/** Dataset containing in-place 1x1 depthwise convolution shapes.
+ *
+ * For a depthwise convolution op to be in-place:
+ * * the output has the same shape as the input;
+ * * 1x1 filter;
+ * * stride == 1;
+ * * dilation == 1;
+ * * no padding.
+ */
+class SmallInPlaceDepthwiseConvolutionLayerDataset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+ SmallInPlaceDepthwiseConvolutionLayerDataset()
+ {
+ add_config(TensorShape(7U, 7U, 1U), Size2D(1U, 1U), PadStrideInfo(1, 1, 0, 0));
+ add_config(TensorShape(11U, 13U, 16U), Size2D(1U, 1U), PadStrideInfo(1, 1, 0, 0));
+ }
+};
} // namespace datasets
} // namespace test
} // namespace arm_compute
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index c88f7c1624..22922f41a2 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -156,6 +156,8 @@ template <typename T>
using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
template <typename T>
using CLDepthwiseConvolutionLayerMixedDataLayoutFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T, true>;
+template <typename T>
+using CLDepthwiseConvolutionLayerInPlaceFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T, false, true>;
TEST_SUITE(Float)
TEST_SUITE(FP16)
@@ -290,6 +292,19 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, f
}
TEST_SUITE_END() // Dilation
TEST_SUITE_END() // Generic
+
+TEST_SUITE(InPlace)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture<half>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(),
+ framework::dataset::make("DepthMultiplier", { 1 })),
+ framework::dataset::make("DataType",
+ DataType::F16)),
+ framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+ ActivationFunctionsDataset))
+{
+ validate(CLAccessor(_src), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // InPlace
TEST_SUITE_END() // FP16
TEST_SUITE(FP32)
@@ -355,7 +370,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerMixedD
framework::dataset::make("DataType",
DataType::F32)),
framework::dataset::make("DataLayout", DataLayout::NHWC)),
- framework::dataset::make("ActivationInfo", ActivationLayerInfo())))
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
@@ -436,6 +451,19 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>,
}
TEST_SUITE_END() // Dilation
TEST_SUITE_END() // Generic
+
+TEST_SUITE(InPlace)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(),
+ framework::dataset::make("DepthMultiplier", { 1 })),
+ framework::dataset::make("DataType",
+ DataType::F32)),
+ framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+ ActivationFunctionsDataset))
+{
+ validate(CLAccessor(_src), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // InPlace
TEST_SUITE_END() // FP32
TEST_SUITE_END() // Float
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index 19ec6b2560..c255cc5c13 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -59,8 +59,9 @@ public:
void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, Size2D dilation,
unsigned int depth_multiplier, DataType input_data_type, DataType weights_data_type,
QuantizationInfo input_quantization_info, QuantizationInfo weights_quantization_info, QuantizationInfo output_quantization_info,
- DataLayout data_layout, ActivationLayerInfo act_info, bool mixed_layout = false)
+ DataLayout data_layout, ActivationLayerInfo act_info, bool mixed_layout = false, bool in_place = false)
{
+ ARM_COMPUTE_ERROR_ON(mixed_layout && in_place);
_mixed_layout = mixed_layout;
_input_shape = in_shape;
_input_data_type = input_data_type;
@@ -73,6 +74,7 @@ public:
_act_info = act_info;
_depth_multiplier = depth_multiplier;
_dilation = dilation;
+ _in_place = in_place;
_bias_data_type = is_data_type_quantized(_input_data_type) ? DataType::S32 : _input_data_type;
@@ -101,13 +103,18 @@ public:
}
// Create tensors
- _src = create_tensor<TensorType>(input_shape, _input_data_type, 1, _input_quantization_info, _data_layout);
- _weights = create_tensor<TensorType>(weights_shape, _weights_data_type, 1, _weights_quantization_info, _data_layout);
- _biases = create_tensor<TensorType>(_biases_shape, _bias_data_type, 1, _input_quantization_info, _data_layout);
- _target = create_tensor<TensorType>(output_shape, _input_data_type, 1, _output_quantization_info, _data_layout);
+ _src = create_tensor<TensorType>(input_shape, _input_data_type, 1, _input_quantization_info, _data_layout);
+ _weights = create_tensor<TensorType>(weights_shape, _weights_data_type, 1, _weights_quantization_info, _data_layout);
+ _biases = create_tensor<TensorType>(_biases_shape, _bias_data_type, 1, _input_quantization_info, _data_layout);
+ TensorType *target_to_use = nullptr;
+ if(!_in_place)
+ {
+ _target = create_tensor<TensorType>(output_shape, _input_data_type, 1, _output_quantization_info, _data_layout);
+ target_to_use = &_target;
+ }
// Create Depthwise Convolution configure function
- _dwc.configure(&_src, &_weights, &_biases, &_target, _pad_stride_info, _depth_multiplier, _act_info, _dilation);
+ _dwc.configure(&_src, &_weights, &_biases, target_to_use, _pad_stride_info, _depth_multiplier, _act_info, _dilation);
ARM_COMPUTE_ASSERT(_src.info()->is_resizable());
ARM_COMPUTE_ASSERT(_weights.info()->is_resizable());
@@ -117,18 +124,26 @@ public:
void allocate_and_run_target()
{
- add_padding_x({ &_src, &_weights, &_biases, &_target }, _data_layout);
+ add_padding_x({ &_src, &_weights, &_biases }, _data_layout);
+ if(!_in_place)
+ {
+ add_padding_x({ &_target }, _data_layout);
+ }
// Allocate tensors
_src.allocator()->allocate();
_weights.allocator()->allocate();
_biases.allocator()->allocate();
- _target.allocator()->allocate();
ARM_COMPUTE_ASSERT(!_src.info()->is_resizable());
ARM_COMPUTE_ASSERT(!_weights.info()->is_resizable());
ARM_COMPUTE_ASSERT(!_biases.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!_target.info()->is_resizable());
+
+ if(!_in_place)
+ {
+ _target.allocator()->allocate();
+ ARM_COMPUTE_ASSERT(!_target.info()->is_resizable());
+ }
// Fill tensors
fill(AccessorType(_src), 0);
@@ -163,6 +178,7 @@ public:
protected:
void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
{
+ ARM_COMPUTE_ERROR_ON(_in_place);
// Test Multi DataLayout graph cases, when the data layout changes after configure
src.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
dst.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
@@ -240,9 +256,10 @@ protected:
unsigned int _depth_multiplier{};
Size2D _dilation{};
bool _mixed_layout{ false };
+ bool _in_place{ false };
};
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false, bool in_place = false>
class DepthwiseConvolutionLayerValidationFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
{
public:
@@ -252,7 +269,7 @@ public:
{
DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier,
data_type, data_type, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(),
- data_layout, act_info, mixed_layout);
+ data_layout, act_info, mixed_layout, in_place);
}
};
@@ -393,7 +410,7 @@ protected:
unsigned int _depth_multiplier{};
};
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool in_place = false>
class DepthwiseConvolutionLayerNativeConfigurableValidationFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
{
public:
@@ -407,6 +424,7 @@ public:
_data_layout = data_layout;
_act_info = act_info;
_n0 = n0;
+ _in_place = in_place;
_input_shape = TensorShape(width, height, channel, batch);
_weights_shape = TensorShape(kernel_size.width, kernel_size.height, channel * _depth_multiplier);
@@ -434,10 +452,15 @@ public:
}
// Create tensors
- _src = create_tensor<TensorType>(input_shape, _data_type, 1, QuantizationInfo(), _data_layout);
- _weights = create_tensor<TensorType>(weights_shape, _data_type, 1, QuantizationInfo(), _data_layout);
- _biases = create_tensor<TensorType>(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout);
- _target = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
+ _src = create_tensor<TensorType>(input_shape, _data_type, 1, QuantizationInfo(), _data_layout);
+ _weights = create_tensor<TensorType>(weights_shape, _data_type, 1, QuantizationInfo(), _data_layout);
+ _biases = create_tensor<TensorType>(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout);
+ TensorType *target_to_use = nullptr;
+ if(!_in_place)
+ {
+ _target = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
+ target_to_use = &_target;
+ }
DWCWeightsKernelInfo dwc_weights_info;
dwc_weights_info.n0 = _n0;
@@ -446,7 +469,7 @@ public:
dwc_info.activation_info = _act_info;
// Create Depthwise Convolution configure function
- _dwc.configure(&_src, &_weights, &_biases, &_target, dwc_weights_info, dwc_info, _conv_info, _depth_multiplier, _dilation);
+ _dwc.configure(&_src, &_weights, &_biases, target_to_use, dwc_weights_info, dwc_info, _conv_info, _depth_multiplier, _dilation);
ARM_COMPUTE_ASSERT(_src.info()->is_resizable());
ARM_COMPUTE_ASSERT(_weights.info()->is_resizable());
@@ -462,12 +485,15 @@ public:
_src.allocator()->allocate();
_weights.allocator()->allocate();
_biases.allocator()->allocate();
- _target.allocator()->allocate();
ARM_COMPUTE_ASSERT(!_src.info()->is_resizable());
ARM_COMPUTE_ASSERT(!_weights.info()->is_resizable());
ARM_COMPUTE_ASSERT(!_biases.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!_target.info()->is_resizable());
+ if(!_in_place)
+ {
+ _target.allocator()->allocate();
+ ARM_COMPUTE_ASSERT(!_target.info()->is_resizable());
+ }
// Fill tensors
fill(AccessorType(_src), 0);
@@ -476,13 +502,19 @@ public:
// Test Multi DataLayout graph cases, when the data layout changes after configure
_src.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
- _target.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+ if(!_in_place)
+ {
+ _target.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+ }
// Compute function
_dwc.run();
// Reinstating original data layout for the test suite to properly check the values
- _target.info()->set_data_layout(_data_layout);
+ if(!_in_place)
+ {
+ _target.info()->set_data_layout(_data_layout);
+ }
}
void compute_reference()
@@ -541,9 +573,10 @@ protected:
Size2D _dilation{};
unsigned int _depth_multiplier{};
unsigned int _n0{};
+ bool _in_place{ false };
};
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false, bool in_place = false>
class DepthwiseConvolutionLayerValidationQuantizedFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
{
public:
@@ -553,11 +586,11 @@ public:
{
DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier, data_type,
data_type, input_quantization_info, input_quantization_info, output_quantization_info,
- data_layout, act_info, mixed_layout);
+ data_layout, act_info, mixed_layout, in_place);
}
};
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW, bool in_place = false>
class DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, TW>
{
public:
@@ -579,7 +612,7 @@ public:
DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, TW>::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier,
input_data_type, weights_data_type,
input_quantization_info, QuantizationInfo(weights_scales), output_quantization_info,
- data_layout, act_info);
+ data_layout, act_info, false, in_place);
}
};
} // namespace validation