author     SiCongLi <sicong.li@arm.com> 2021-06-29 13:18:30 +0100
committer  SiCong Li <sicong.li@arm.com> 2021-07-02 12:05:37 +0000
commit     bc4e31113be0af320f44b338969d6972b64ca4de (patch)
tree       21e44fad55a80794a9609e847188764cf42f13d7 /src
parent     66831659fdef07c428993dccfa5d92416bae1ef9 (diff)
download   ComputeLibrary-bc4e31113be0af320f44b338969d6972b64ca4de.tar.gz
Implement FP GPU depthwise convolution 1x1 kernel for in-place computation
* Implement in-place graph node mutator for 1x1 depthwise convolution
* Add in-place support to the validation fixtures, except for DepthwiseConvolutionLayerNativeValidationFixture, as it would otherwise duplicate coverage (the DepthwiseConvolutionLayerNative test already exercises the underlying kernel)

Resolves: COMPMID-4432
Change-Id: Id7f10f5ebdce7d49f550c0b62dbaaab7f5b59d29
Signed-off-by: SiCongLi <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5874
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
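Usage overview (not part of the patch): with the changes below, CLDepthwiseConvolutionLayer accepts the input tensor (or nullptr) as its output to request in-place computation. A minimal sketch, assuming this patch is applied; the shapes and filter size are illustrative, and no real weight data is loaded:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // NHWC FP32 input (C = 32, W = 16, H = 16) and a 1x1 filter [IFM, N, M].
    // With depth_multiplier == 1, unit strides, unit dilation and no padding,
    // the output shape equals the input shape, so in-place is legal.
    TensorInfo src_info(TensorShape(32U, 16U, 16U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo w_info(TensorShape(32U, 1U, 1U), 1, DataType::F32);
    w_info.set_data_layout(DataLayout::NHWC);

    CLTensor src{}, weights{};
    src.allocator()->init(src_info);
    weights.allocator()->init(w_info);

    CLDepthwiseConvolutionLayer dwc{};
    // Passing the input tensor as the output selects the in-place path.
    dwc.configure(&src, &weights, nullptr /* biases */, &src, PadStrideInfo(1, 1, 0, 0));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    // Illustrative only: real code would fill src and weights (map/unmap or
    // accessors) before running. The result overwrites src.
    dwc.run();
    return 0;
}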
Diffstat (limited to 'src')
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp   33
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h     95
-rw-r--r--  src/graph/mutators/InPlaceOperationMutator.cpp                    75
-rw-r--r--  src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp          17
4 files changed, 156 insertions, 64 deletions
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 4cc0e462c4..eb1cf146af 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -46,7 +46,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
{
ARM_COMPUTE_UNUSED(dwc_info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ bool in_place = false;
+ if(output == nullptr || output == input)
+ {
+ in_place = true;
+ output = input;
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
@@ -58,6 +64,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_UNUSED(idx_c);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
+ // In-place restrictions
+ if(in_place)
+ {
+ const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride() != std::make_pair(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.has_padding()); // Note that in principle padding can be supported with in_place, but we choose not to support it
+ }
+
const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
@@ -139,19 +157,24 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
_type = CLKernelType::DEPTHWISE;
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts);
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
const DWCWeightsKernelInfo &dwc_weights_info,
const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ if(output == nullptr)
+ {
+ // In-place
+ output = input;
+ }
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
(output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
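Not part of the patch: the restrictions enforced above read naturally as a single predicate. A standalone sketch (the helper name is hypothetical) mirroring the checks in validate_arguments():

#include <utility>

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Hypothetical helper: true iff the in-place restrictions checked by
// validate_arguments() hold (1x1 filter, depth_multiplier == 1, unit
// strides, unit dilation, no padding).
bool can_run_depthwise_in_place(const ITensorInfo &weights, const PadStrideInfo &conv_info,
                                unsigned int depth_multiplier, const Size2D &dilation)
{
    const int w_idx = get_data_layout_dimension_index(weights.data_layout(), DataLayoutDimension::WIDTH);
    const int h_idx = get_data_layout_dimension_index(weights.data_layout(), DataLayoutDimension::HEIGHT);
    return weights.tensor_shape()[w_idx] == 1U && weights.tensor_shape()[h_idx] == 1U
           && depth_multiplier == 1U
           && conv_info.stride() == std::make_pair(1U, 1U)
           && dilation == Size2D(1U, 1U)
           && !conv_info.has_padding();
}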
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 325f4e7067..068131f434 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,64 +49,47 @@ public:
CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
/** Initialize the function's source, destination and parameters
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+ * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
+ * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+ * @param[in] dwc_info Depthwise convolution layer info
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+ * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
* the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ *
+ * @note In-place is only supported when:
+ * * data layout: NHWC
+ * * filter: 1x1
+ * * @p depth_multiplier: 1
+ * * strides: 1
+ * * dilation: 1
+ * * no padding
+ * * no change of data layout after configure
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+
/** Initialize the function's source, destination and parameters
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
+ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*
* @return a status
*/
@@ -118,14 +101,14 @@ public:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- ICLTensor *_output;
- unsigned int _depth_multiplier;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
+ const ICLTensor *_input{};
+ const ICLTensor *_weights{};
+ const ICLTensor *_biases{};
+ ICLTensor *_output{};
+ unsigned int _depth_multiplier{ 0 };
+ const ICLTensor *_output_multipliers{};
+ const ICLTensor *_output_shifts{};
+ bool _is_quantized{ false };
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
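The conditions in the @note above can be probed programmatically through the static validate() without configuring anything. A sketch under two assumptions: this kernel header lives under src/, so the snippet only builds inside the library tree, and the n0 value is an arbitrary illustrative choice:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"

using namespace arm_compute;

bool in_place_supported()
{
    // NHWC FP32 input (C = 32, W = 16, H = 16) and a 1x1 filter [IFM, N, M].
    TensorInfo src_info(TensorShape(32U, 16U, 16U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo w_info(TensorShape(32U, 1U, 1U), 1, DataType::F32);
    w_info.set_data_layout(DataLayout::NHWC);

    DWCWeightsKernelInfo dwc_weights_info{};
    dwc_weights_info.n0 = 1; // output elements per thread; 1 is an arbitrary choice here
    DWCKernelInfo dwc_info{};

    // Passing the input info as the output triggers the in-place checks.
    const Status status = CLDepthwiseConvolutionLayerNativeKernel::validate(
        &src_info, &w_info, nullptr /* biases */, &src_info,
        dwc_weights_info, dwc_info, PadStrideInfo(1, 1, 0, 0));
    return status.error_code() == ErrorCode::OK;
}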
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
index 616ec5c73d..86236e8854 100644
--- a/src/graph/mutators/InPlaceOperationMutator.cpp
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -23,9 +23,15 @@
*/
#include "arm_compute/graph/mutators/InPlaceOperationMutator.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
+#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
+#include "support/Cast.h"
+
+using namespace arm_compute::utils::cast;
namespace arm_compute
{
@@ -82,6 +88,69 @@ void set_new_output_and_inherit_accessor(std::unique_ptr<INode> &node, Tensor *o
node->set_output_tensor(new_output->id(), 0);
}
+// Try to mutate the node to perform the depthwise in-place calculation
+void try_in_place_depthwiseconv(std::unique_ptr<INode> &node)
+{
+ // Get input edge
+ Edge *input_edge = node->input_edge(0);
+ Edge *weight_edge = node->input_edge(1);
+ ARM_COMPUTE_ERROR_ON(input_edge == nullptr || weight_edge == nullptr);
+
+ auto input_tensor = input_edge->tensor();
+ auto weight_tensor = weight_edge->tensor();
+ ARM_COMPUTE_ERROR_ON(input_tensor == nullptr || weight_tensor == nullptr);
+
+ const auto input_shape = input_tensor->desc().shape;
+ const auto qinfo_input = input_tensor->desc().quant_info;
+
+ const auto weight_shape = weight_tensor->desc().shape;
+ const auto weight_layout = weight_tensor->desc().layout;
+
+ // Extract PadStrideInfo and depth multiplier
+ PadStrideInfo conv_info{};
+ unsigned int depth_multiplier{};
+ if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer)
+ {
+ conv_info = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->convolution_info();
+ depth_multiplier = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->depth_multiplier();
+ }
+ else if(node->type() == NodeType::DepthwiseConvolutionLayer)
+ {
+ conv_info = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->convolution_info();
+ depth_multiplier = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->depth_multiplier();
+ }
+
+ // Get current output tensor
+ auto current_output_tensor = node->output(0);
+ ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr);
+ const auto out_shape = current_output_tensor->desc().shape;
+ const auto qinfo_out = current_output_tensor->desc().quant_info;
+
+ bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr);
+
+ // Specify conditions with which input can be in-placed
+ input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC;
+
+ const int weights_width_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::WIDTH);
+ const int weights_height_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::HEIGHT);
+ const bool is_1x1 = weight_shape[weights_width_idx] == 1U && weight_shape[weights_height_idx] == 1U;
+ input_can_in_place &= is_1x1;
+
+ input_can_in_place &= depth_multiplier == 1;
+ input_can_in_place &= conv_info.stride() == std::make_pair(1U, 1U);
+ input_can_in_place &= !conv_info.has_padding();
+ // NOTE: Dilation should also be (1, 1); however, dilation is currently not supported by the depthwise convolution node
+
+ if(input_can_in_place)
+ {
+ set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor);
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor, the tensor infos are incompatible, or the depthwise in-place conditions are not met.\n");
+ }
+}
+
// Try to mutate the node to perform the elementwise in-place calculation
void try_in_place_elementwise(std::unique_ptr<INode> &node)
{
@@ -148,6 +217,8 @@ void InPlaceOperationMutator::mutate(Graph &g)
NodeType::BatchNormalizationLayer,
NodeType::EltwiseLayer,
NodeType::UnaryEltwiseLayer,
+ NodeType::DepthwiseConvolutionLayer,
+ NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer,
NodeType::PrintLayer
};
@@ -166,6 +237,10 @@ void InPlaceOperationMutator::mutate(Graph &g)
{
try_in_place_elementwise(node);
}
+ else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer)
+ {
+ try_in_place_depthwiseconv(node);
+ }
else
{
// Get current and new output tensors
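Not part of the patch: a graph-frontend sketch of a network on which this mutator can fire, assuming the default pass manager (which registers InPlaceOperationMutator) is used. Accessors are left null because an accessor bound to the input tensor disables the mutation; shapes and layer parameters are illustrative:

#include "arm_compute/graph.h"

using namespace arm_compute;
using namespace arm_compute::graph;
using namespace arm_compute::graph::frontend;

int main()
{
    // A 1x1 depthwise convolution with unit stride, no padding, NHWC layout
    // and depth multiplier 1 satisfies every condition checked above, so the
    // mutator can fold the node's output onto its input tensor.
    Stream stream{ 0, "dwc_in_place" };
    stream << Target::CL
           << InputLayer(TensorDescriptor(TensorShape(32U, 16U, 16U, 1U), DataType::F32,
                                          QuantizationInfo(), DataLayout::NHWC),
                         nullptr /* no accessor: required for in-place */)
           << DepthwiseConvolutionLayer(1U, 1U, nullptr /* weights accessor */,
                                        nullptr /* bias accessor */, PadStrideInfo(1, 1, 0, 0))
           << OutputLayer(nullptr);
    stream.finalize(Target::CL, GraphConfig{});
    stream.run();
    return 0;
}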
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index c7520cd087..a826f85c5c 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -136,11 +136,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
ICLTensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
weights->info(),
biases != nullptr ? biases->info() : nullptr,
- output->info(),
+ output != nullptr ? output->info() : input->info(),
conv_info,
depth_multiplier,
act_info,
@@ -220,6 +220,11 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
+ const bool in_place = input == output || output == nullptr;
+ if(in_place)
+ {
+ output = input;
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
@@ -254,6 +259,7 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
if(needs_permute)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
@@ -309,7 +315,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
_output_shifts.map();
quantization::compute_quantized_multipliers_and_shifts(_input->info(),
_original_weights->info(),
- _output->info(),
+ _output != nullptr ? _output->info() : _input->info(),
reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
_output_multipliers.unmap();
@@ -549,6 +555,11 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
unsigned int depth_multiplier,
ActivationLayerInfo act_info, const Size2D &dilation)
{
+ if(output == nullptr)
+ {
+ // In-place
+ output = input;
+ }
_depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
dilation);
switch(_depth_conv_func)
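The nullptr-as-in-place convention handled above recurs in configure(), validate() and prepare(). A hypothetical one-liner (not in the patch) capturing it:

#include "arm_compute/core/CL/ICLTensor.h"

namespace
{
// Hypothetical helper: a null output selects in-place computation, so the
// effective output is the input itself.
arm_compute::ICLTensor *resolve_output(arm_compute::ICLTensor *input, arm_compute::ICLTensor *output)
{
    return (output != nullptr) ? output : input;
}
} // namespace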