aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichalis Spyrou <michalis.spyrou@arm.com>2021-04-08 12:02:58 +0100
committerMichalis Spyrou <michalis.spyrou@arm.com>2021-04-19 13:45:08 +0000
commit60c3b0e6821a80d78ffca5be30e05d062d071cd2 (patch)
tree3e263a45aa9617cfd7704b2b33ea4337f1582321 /src
parent4f1650f0c9919f0bac5024b8e31c0f754d25aec3 (diff)
downloadComputeLibrary-60c3b0e6821a80d78ffca5be30e05d062d071cd2.tar.gz
Port DepthwiseConvolution to new API
Resolves: COMPMID-4185 Change-Id: Ib5f22356356a022d567bb18d44ea272b62d10ebf Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5424 Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp9
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp5
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp8
-rw-r--r--src/core/NEON/NEKernels.h1
-rw-r--r--src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h131
-rw-r--r--src/core/cpu/kernels/CpuActivationKernel.cpp2
-rw-r--r--src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp (renamed from src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp)139
-rw-r--r--src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h117
-rw-r--r--src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp14
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp409
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp521
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConvolution.h230
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp (renamed from src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp)265
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h97
14 files changed, 1368 insertions, 580 deletions
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index e8daa56f36..287a965f5b 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -105,7 +105,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
if(output->total_size() != 0)
{
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
@@ -116,7 +117,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
unsigned int depth_multiplier, GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation)
{
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const ConvolutionInfo info
+ {
+ conv_info, depth_multiplier, ActivationLayerInfo(), dilation
+ };
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, info);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
const unsigned int conv_stride_x = conv_info.stride().first;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 6400ba53e2..f7603e6397 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -65,8 +65,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
const size_t weights_width = 3;
const size_t weights_height = 3;
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
- *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
+ const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), info);
if(is_qasymm)
{
DepthwiseConvolutionReshapeInfo info;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index c34018a000..fcfa7f878d 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,7 +58,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_UNUSED(idx_c);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
const bool is_quantized = is_data_type_quantized(input->data_type());
@@ -156,7 +157,8 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
auto padding_info = get_padding_info({ input, output });
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info, depth_multiplier, dilation);
+ const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), info);
auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
_input = input;
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 264f521be2..e982470402 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -38,7 +38,6 @@
#include "src/core/NEON/kernels/NECropKernel.h"
#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
-#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
deleted file mode 100644
index 713cdcd9d9..0000000000
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/INEKernel.h"
-#include "support/Requires.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_neon.h>
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthwiseConvolutionLayerNativeKernel";
- }
- /** Default constructor */
- NEDepthwiseConvolutionLayerNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayerNativeKernel(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayerNativeKernel &operator=(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
- /** Default Move Constructor. */
- NEDepthwiseConvolutionLayerNativeKernel(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
- /** Default move assignment operator */
- NEDepthwiseConvolutionLayerNativeKernel &operator=(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
- /** Default destructor */
- ~NEDepthwiseConvolutionLayerNativeKernel() = default;
- /** Initialize the function's source, destination and parameters.
- *
- * @note Supported data layouts: NHWC
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- */
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- const Size2D &dilation = Size2D(1U, 1U));
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerNativeKernel
- *
- * @note Supported data layouts: NHWC
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- template <typename T>
- using FloatEnalber = typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, int>::type;
-
- template <typename T, typename TW, FloatEnalber<T> = 0>
- void run_depthwise(const Window &window, bool has_biases);
-
- template <typename T>
- using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type;
-
- template <typename T, typename TW, Quantized8bitEnalber<T> = 0>
- void run_depthwise(const Window &window, bool has_biases);
-
- /** Common signature for all the specialised depthwise convolution native functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases);
-
- DepthwiseFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_weights;
- const ITensor *_biases;
- ITensor *_output;
- PadStrideInfo _conv_info;
- unsigned int _depth_multiplier;
- Size2D _dilation;
- std::vector<int> _output_multiplier;
- std::vector<int> _output_shift;
- bool _has_biases;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp
index 761258941d..eb38c18cff 100644
--- a/src/core/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/core/cpu/kernels/CpuActivationKernel.cpp
@@ -205,7 +205,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
_act_info = activation_info;
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp
index 24fd01fee1..a5d1b61c08 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp
@@ -21,8 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "src/core/CPP/Validate.h"
@@ -35,6 +37,10 @@
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
constexpr auto data_layout = DataLayout::NHWC;
@@ -716,19 +722,18 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITenso
input_it, weights_it, biases_it, output_it);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- const Size2D &dilation)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > input->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > input->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * info.depth_multiplier) != weights->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
if(is_data_type_quantized_per_channel(weights->data_type()))
{
@@ -757,7 +762,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
if(output->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -766,35 +771,30 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
} // namespace
-NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
- : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
+CpuDepthwiseConvolutionNativeKernel::CpuDepthwiseConvolutionNativeKernel()
+ : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
{
}
-void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
+void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));
-
- _input = input;
- _weights = weights;
- _biases = biases;
- _output = output;
- _conv_info = conv_info;
- _depth_multiplier = depth_multiplier;
- _dilation = dilation;
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, weights, (biases != nullptr) ? biases : nullptr, output, info));
+
+ _conv_info = info.pad_stride_info;
+ _depth_multiplier = info.depth_multiplier;
+ _dilation = info.dilation;
_has_biases = (biases != nullptr);
- if(is_data_type_quantized(_input->info()->data_type()))
+ if(is_data_type_quantized(input->data_type()))
{
- const auto input_scale = input->info()->quantization_info().uniform().scale;
- const auto output_scale = output->info()->quantization_info().uniform().scale;
+ const auto input_scale = input->quantization_info().uniform().scale;
+ const auto output_scale = output->quantization_info().uniform().scale;
- auto weights_scale = weights->info()->quantization_info().scale();
- if(!is_data_type_quantized_per_channel(_weights->info()->data_type()))
+ auto weights_scale = weights->quantization_info().scale();
+ if(!is_data_type_quantized_per_channel(weights->data_type()))
{
- for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i)
+ for(size_t i = 1; i < weights->dimension(channel_idx); ++i)
{
weights_scale.push_back(weights_scale.front());
}
@@ -812,100 +812,107 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co
}
}
- switch(_weights->info()->data_type())
+ switch(weights->data_type())
{
case DataType::QASYMM8:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t>;
+ _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, uint8_t>;
break;
case DataType::QASYMM8_SIGNED:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
+ _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
break;
case DataType::QSYMM8_PER_CHANNEL:
- if(_input->info()->data_type() == DataType::QASYMM8)
+ if(input->data_type() == DataType::QASYMM8)
{
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t>;
+ _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, int8_t>;
}
else
{
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
+ _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
}
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t>;
+ _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float16_t, float16_t>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float>;
+ _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float, float>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
}
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
- Window win = calculate_max_window(*output->info(), Steps());
- INEKernel::configure(win);
+ Window win = calculate_max_window(*output, Steps());
+ ICpuKernel::configure(win);
}
-Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const Size2D &dilation)
+Status CpuDepthwiseConvolutionNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, info));
return Status{};
}
-void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- (this->*_func)(window, _has_biases);
-}
-
-template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::FloatEnalber<T>>
-void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::FloatEnalber<T>>
+void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+ ITensor *dst, const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
if(_depth_multiplier == 1)
{
- depthwise_loop_multiplier1_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);
+ depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases);
}
else
{
- depthwise_loop_generic_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases);
+ depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases);
}
}
-template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::Quantized8bitEnalber<T>>
-void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::Quantized8bitEnalber<T>>
+void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+ ITensor *dst, const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
if(_depth_multiplier == 1)
{
- depthwise_loop_multiplier1_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
+ depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
}
else
{
const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
- const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(_weights->info()->data_type()));
+ const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
{
- depthwise_loop_pow2_quantized_per_tensor<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
+ depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
}
else
{
- depthwise_loop_generic_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
+ depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
}
}
}
+
+void CpuDepthwiseConvolutionNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ (this->*_func)(src, weights, biases, dst, window, _has_biases);
+}
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
new file mode 100644
index 0000000000..242536d441
--- /dev/null
+++ b/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
+
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "support/Requires.h"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include <arm_neon.h>
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to run a depthwise convolution native on a tensor. */
+class CpuDepthwiseConvolutionNativeKernel : public ICpuKernel
+{
+public:
+ const char *name() const override
+ {
+ return "CpuDepthwiseConvolutionNativeKernel";
+ }
+ /** Default constructor */
+ CpuDepthwiseConvolutionNativeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConvolutionNativeKernel);
+
+ /** Initialize the function's source, destination and parameters.
+ *
+ * @note Supported data layouts: NHWC
+ *
+ * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ */
+ void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionNativeKernel
+ *
+ * @note Supported data layouts: NHWC
+ *
+ * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+ template <typename T>
+ using FloatEnalber = typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, int>::type;
+
+ template <typename T, typename TW, FloatEnalber<T> = 0>
+ void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+
+ template <typename T>
+ using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type;
+
+ template <typename T, typename TW, Quantized8bitEnalber<T> = 0>
+ void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+
+ /** Common signature for all the specialised depthwise convolution native functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using DepthwiseFunctionPtr = void (CpuDepthwiseConvolutionNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+
+ DepthwiseFunctionPtr _func;
+ PadStrideInfo _conv_info;
+ unsigned int _depth_multiplier;
+ Size2D _dilation;
+ std::vector<int> _output_multiplier;
+ std::vector<int> _output_shift;
+ bool _has_biases;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H */
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index f7517a50a3..8e3d010786 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -82,9 +82,10 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh
if(needs_permute)
{
- TensorShape permuted_input_shape = input->tensor_shape();
- TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
@@ -272,9 +273,10 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
if(needs_permute)
{
- TensorShape permuted_input_shape = input->tensor_shape();
- TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index d17f6b5cd9..e1ceb0f083 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,54 +27,39 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
-namespace
-{
-Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
+
+struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- if(!is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
- if(biases != nullptr)
+ ITensor *src{ nullptr }; // SRC_0
+ ITensor *dst{ nullptr }; // DST_0
+ const ITensor *weights
{
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
- }
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
-
- //Validate Activation Layer
- if(act_info.enabled())
+ nullptr
+ }; // SRC_1
+ const ITensor *biases
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
- }
- return Status{};
-}
-} // namespace
-
-NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
+ nullptr
+ }; // SRC_2
+ Tensor permuted_input{}; // INT_0
+ Tensor permuted_weights{}; // INT_1
+ Tensor permuted_output{}; // INT_2
+ Tensor workspace{}; // INT_3
+ Tensor packed_weights{}; // INT_4
+ std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+ bool is_prepared{ false };
+ bool permute{ false };
+};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _dwc_optimized_func(memory_manager), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
- _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+ : _memory_group(memory_manager), _impl(std::make_unique<Impl>())
{
}
@@ -87,65 +72,76 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
- output->info(), conv_info, depth_multiplier, act_info, dilation));
-
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _has_bias = biases != nullptr;
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _permute = _is_nchw;
- _is_prepared = false;
- _is_activationlayer_enabled = act_info.enabled();
+
+ bool is_nhwc = input->info()->data_layout() == DataLayout::NCHW;
+ _impl->src = input;
+ _impl->weights = weights;
+ _impl->biases = biases;
+ _impl->dst = output;
+ _impl->permute = is_nhwc;
+
+ _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+ ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
+ _impl->dst->info(), info);
// Configure pipeline
- ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
- const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
- const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
- _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
- if(!_is_activationlayer_enabled)
+ ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+ bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
+
+ if(!is_activationlayer_enabled)
{
act_info_to_use = act_info;
}
+ info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
- if(_is_nchw)
+ auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConvolutionAssemblyDispatch>();
+
+ if(is_nhwc)
{
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_output);
+ auto permute_input = std::make_unique<cpu::CpuPermute>();
+ auto permute_weights = std::make_unique<cpu::CpuPermute>();
+ auto permute_output = std::make_unique<cpu::CpuPermute>();
+
+ _memory_group.manage(&_impl->permuted_input);
+ _memory_group.manage(&_impl->permuted_weights);
+ _memory_group.manage(&_impl->permuted_output);
// Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
- _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+ permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
+ _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
- _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+ permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
+ _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+ _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
// Configure optimized depthwise
- _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
+ dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases->info(), _impl->permuted_output.info(), info);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+ _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_output.allocator()->allocate();
+ _impl->permuted_input.allocator()->allocate();
+ _impl->permuted_output.allocator()->allocate();
}
else
{
- _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
+ dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases->info(), _impl->dst->info(), info);
}
- // Configure activation
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
+ // Allocate memory based on the internal memory requirements
+ experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
+ _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size }, 1, DataType::S8), mem_req[0].alignment);
+ _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size }, 1, DataType::S8), mem_req[1].alignment);
+
+ _impl->workspace.allocator()->allocate();
+ _impl->packed_weights.allocator()->allocate();
}
Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
@@ -157,63 +153,66 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
const ActivationLayerInfo &act_info,
const Size2D &dilation)
{
- return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
{
prepare();
-
MemoryGroupResourceScope scope_mg(_memory_group);
- // Permute input
- if(_permute)
- {
- _permute_input.run();
- }
-
- // Run assembly function
- _dwc_optimized_func.run();
-
- // Permute output
- if(_is_nchw)
- {
- _permute_output.run();
- }
-
- // Run activation
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+ pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
+ pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
+ pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
+ pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace);
+ pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights);
+ pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+
+ _impl->op->run(pack);
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
- if(!_is_prepared)
+ if(!_impl->is_prepared)
{
// Permute weights
- if(_permute)
+ if(_impl->permute)
{
- _permuted_weights.allocator()->allocate();
- _permute_weights.run();
- _original_weights->mark_as_unused();
+ _impl->permuted_weights.allocator()->allocate();
+ _impl->weights->mark_as_unused();
}
- // Prepare optimized function
- _dwc_optimized_func.prepare();
- if(!_permuted_weights.is_used())
+ if(!_impl->permuted_weights.is_used())
{
- _permuted_weights.allocator()->free();
+ _impl->permuted_weights.allocator()->free();
}
- _is_prepared = true;
+ _impl->is_prepared = true;
}
}
+struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
+{
+ Tensor permuted_input{};
+ Tensor permuted_weights{};
+ Tensor permuted_output{};
+ bool is_prepared{ false };
+ bool is_nchw{ false };
+ bool is_activationlayer_enabled{ false };
+ const ITensor *weights{ nullptr };
+ const ITensor *biases{ nullptr };
+ const ITensor *src{ nullptr };
+ ITensor *dst{ nullptr };
+ std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+};
+
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
- : _depthwise_conv_kernel(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
- _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+ : _impl(std::make_unique<Impl>())
{
}
@@ -224,45 +223,49 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
output->info(), conv_info, depth_multiplier, act_info, dilation));
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _is_prepared = !_is_nchw;
+ const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+ _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
+
+ _impl->src = input;
+ _impl->dst = output;
+ _impl->weights = weights;
+ _impl->biases = biases;
+ _impl->is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _impl->is_prepared = !_impl->is_nchw;
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = output;
- if(_is_nchw)
+ if(_impl->is_nchw)
{
- _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
- _permuted_input.info()->set_data_layout(DataLayout::NHWC);
- input_to_use = &_permuted_input;
+ auto permute_input = std::make_unique<cpu::CpuPermute>();
+ auto permute_weights = std::make_unique<cpu::CpuPermute>();
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
- _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
- weights_to_use = &_permuted_weights;
+ permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
+ _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
+ input_to_use = &_impl->permuted_input;
- _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
- output_to_use = &_permuted_output;
+ permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
+ _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+ weights_to_use = &_impl->permuted_weights;
+
+ _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ output_to_use = &_impl->permuted_output;
}
- _original_weights = weights_to_use;
- _depthwise_conv_kernel = std::make_unique<NEDepthwiseConvolutionLayerNativeKernel>();
- _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
+ auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>();
+ depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases->info(), output_to_use->info(), info);
- if(_is_nchw)
+ if(_impl->is_nchw)
{
- _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ auto permute_output = std::make_unique<cpu::CpuPermute>();
+ permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
+ _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
-
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
+ _impl->permuted_input.allocator()->allocate();
+ _impl->permuted_weights.allocator()->allocate();
+ _impl->permuted_output.allocator()->allocate();
}
}
@@ -270,89 +273,53 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate
const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- if(input->data_layout() == DataLayout::NCHW)
- {
- TensorShape permuted_input_shape = input->tensor_shape();
- TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
- permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
- permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
- permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
-
- const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, dilation));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, dilation));
- }
-
- // Validate Activation Layer
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
- }
-
- return Status{};
+ ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
{
- if(_is_nchw)
- {
- prepare();
- _permute_input.run();
- }
-
- NEScheduler::get().schedule(_depthwise_conv_kernel.get(), Window::DimY);
-
- if(_is_nchw)
- {
- _permute_output.run();
- }
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+ pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
+ pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
+ pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
+ pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+
+ _impl->op->run(pack);
}
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::prepare()
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
{
- if(!_is_prepared)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _permute_weights.run();
- _original_weights->mark_as_unused();
- _is_prepared = true;
- }
}
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(std::move(memory_manager)), _func_generic()
+#ifndef DOXYGEN_SKIP_THIS
+struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
{
-}
+ DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
+ NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+ NEDepthwiseConvolutionLayerGeneric func_generic{};
+ std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+};
+#endif // DOXYGEN_SKIP_THIS
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
const ActivationLayerInfo &act_info, const Size2D &dilation)
{
- _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation);
- switch(_depth_conv_func)
+ const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ _impl->op = std::make_shared<cpu::CpuDepthwiseConvolution>();
+ _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
+ info);
+ switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
break;
case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
@@ -362,43 +329,19 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
- DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- switch(depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- return NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- return NEDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
- }
-}
-
-DepthwiseConvolutionFunction NEDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- if(bool(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)))
- {
- return DepthwiseConvolutionFunction::OPTIMIZED;
- }
- else
- {
- return DepthwiseConvolutionFunction::GENERIC;
- }
+ ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::run()
{
- switch(_depth_conv_func)
+ switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.run();
+ _impl->func_optimized.run();
break;
case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.run();
+ _impl->func_generic.run();
break;
default:
ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
@@ -407,13 +350,13 @@ void NEDepthwiseConvolutionLayer::run()
void NEDepthwiseConvolutionLayer::prepare()
{
- switch(_depth_conv_func)
+ switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.prepare();
+ _impl->func_optimized.prepare();
break;
case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.prepare();
+ _impl->func_generic.prepare();
break;
default:
ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp
new file mode 100644
index 0000000000..183a2af0cd
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ if(!is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > input->dimension(idx_w) + info.pad_stride_info.pad_left() +
+ info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > input->dimension(idx_h) + info.pad_stride_info.pad_top() +
+ info.pad_stride_info.pad_bottom());
+
+ if(biases != nullptr)
+ {
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, info));
+
+ //Validate Activation Layer
+ if(info.act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info));
+ }
+ return Status{};
+}
+} // namespace
+
+CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::CpuDepthwiseConvolutionOptimizedInternal()
+ : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false),
+ _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+{
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configure(ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, (biases == nullptr) ? nullptr : biases,
+ output, info));
+
+ _is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ _has_bias = biases != nullptr;
+ _is_nchw = input->data_layout() == DataLayout::NCHW;
+ _permute = _is_nchw;
+ _is_prepared = false;
+
+ // Configure pipeline
+ ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info);
+ _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6);
+
+ if(!_is_activationlayer_enabled)
+ {
+ act_info_to_use = info.act_info;
+ }
+
+ _dwc_optimized_func = std::make_unique<CpuDepthwiseConvolutionAssemblyDispatch>();
+ if(_is_nchw)
+ {
+ _permute_input = std::make_unique<cpu::CpuPermute>();
+ _permute_weights = std::make_unique<cpu::CpuPermute>();
+ _permute_output = std::make_unique<cpu::CpuPermute>();
+
+ auto input_perm = std::make_unique<TensorInfo>();
+ auto weights_perm = std::make_unique<TensorInfo>();
+ auto output_perm = std::make_unique<TensorInfo>();
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ input_perm->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+ weights_perm->set_data_layout(DataLayout::NHWC);
+
+ output_perm->set_data_layout(DataLayout::NHWC);
+ output_perm->set_quantization_info(output->quantization_info());
+
+ // Configure optimized depthwise
+ _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
+
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ output_perm->set_data_layout(DataLayout::NHWC);
+ _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U));
+ }
+ else
+ {
+ _dwc_optimized_func->configure(input, weights, biases, output, info);
+ }
+
+ // Configure activation
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<cpu::CpuActivation>();
+ _activationlayer_function->configure(output, nullptr, info.act_info);
+ }
+}
+
+Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const ConvolutionInfo &info)
+{
+ return validate_arguments_optimized(input, weights, biases, output, info);
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ prepare(tensors);
+
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+ auto workspace = tensors.get_tensor(TensorType::ACL_INT_3);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+ // Permute input
+ if(_permute)
+ {
+ ITensorPack pack;
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ pack.add_tensor(TensorType::ACL_SRC, src);
+ pack.add_tensor(TensorType::ACL_DST, src_perm);
+ _permute_input->run(pack);
+ }
+
+ // Run assembly function
+ if(_is_nchw)
+ {
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, src_perm);
+ pack.add_tensor(TensorType::ACL_SRC_1, weights_perm);
+ pack.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack.add_tensor(TensorType::ACL_INT_0, workspace);
+ pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+ pack.add_tensor(TensorType::ACL_DST, dst_perm);
+ _dwc_optimized_func->run(pack);
+ }
+ else
+ {
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, src);
+ pack.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack.add_tensor(TensorType::ACL_INT_0, workspace);
+ pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _dwc_optimized_func->run(pack);
+ }
+
+ // Permute output
+ if(_is_nchw)
+ {
+ ITensorPack pack;
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+ pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output->run(pack);
+ }
+
+ // Run activation
+ if(_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+ // Permute weights
+ if(_permute)
+ {
+ auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, weights);
+ pack.add_tensor(TensorType::ACL_DST, permuted_weights);
+ _permute_weights->run(pack);
+
+ ITensorPack pack_opt;
+ pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights);
+ pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+ // Prepare optimized function
+ _dwc_optimized_func->prepare(pack_opt);
+ }
+ else
+ {
+ ITensorPack pack_opt;
+ pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+ // Prepare optimized function
+ _dwc_optimized_func->prepare(pack_opt);
+ }
+
+ _is_prepared = true;
+ }
+}
+
+CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::CpuDepthwiseConvolutionGeneric()
+ : _depthwise_conv_kernel(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false),
+ _is_activationlayer_enabled(false)
+{
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolution::validate(input, weights, (biases == nullptr) ? nullptr : biases,
+ output, info));
+
+ _is_nchw = input->data_layout() == DataLayout::NCHW;
+ _is_prepared = !_is_nchw;
+
+ ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ ITensorInfo *output_to_use = output;
+
+ auto input_perm = std::make_unique<TensorInfo>();
+ auto weights_perm = std::make_unique<TensorInfo>();
+ auto output_perm = std::make_unique<TensorInfo>(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+
+ if(_is_nchw)
+ {
+ _permute_input = std::make_unique<cpu::CpuPermute>();
+ _permute_weights = std::make_unique<cpu::CpuPermute>();
+
+ _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ input_perm->set_data_layout(DataLayout::NHWC);
+ input_to_use = input_perm.get();
+
+ _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+ weights_perm->set_data_layout(DataLayout::NHWC);
+ weights_to_use = weights_perm.get();
+
+ output_to_use = output_perm.get();
+ }
+
+ _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>();
+ _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
+
+ if(_is_nchw)
+ {
+ _permute_output = std::make_unique<cpu::CpuPermute>();
+ _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U));
+ output_perm->set_data_layout(DataLayout::NHWC);
+ }
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = info.act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<cpu::CpuActivation>();
+ _activationlayer_function->configure(output, nullptr, info.act_info);
+ }
+}
+
+Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ if(input->data_layout() == DataLayout::NCHW)
+ {
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+ const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(input, weights, biases, output, info));
+ }
+
+ // Validate Activation Layer
+ if(info.act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info));
+ }
+
+ return Status{};
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::run(ITensorPack &tensors)
+{
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+
+ if(_is_nchw)
+ {
+ prepare(tensors);
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, src);
+ pack.add_tensor(TensorType::ACL_DST, src_perm);
+ _permute_input->run(pack);
+
+ ITensorPack pack_depth;
+ pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm);
+ pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
+ pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+ pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ }
+ else
+ {
+ ITensorPack pack_depth;
+ pack_depth.add_tensor(TensorType::ACL_SRC_0, src);
+ pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+ pack_depth.add_tensor(TensorType::ACL_DST, dst);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ }
+
+ if(_is_nchw)
+ {
+ ITensorPack pack;
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+ pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output->run(pack);
+ }
+
+ if(_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+
+void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ ARM_COMPUTE_ERROR_ON(!weights->is_used());
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, weights);
+ pack.add_tensor(TensorType::ACL_DST, weights_perm);
+
+ _permute_weights->run(pack);
+ weights->mark_as_unused();
+ _is_prepared = true;
+ }
+}
+
+CpuDepthwiseConvolution::CpuDepthwiseConvolution()
+ : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic()
+{
+}
+
+void CpuDepthwiseConvolution::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+{
+ _depth_conv_func = get_depthwiseconvolution_function(input, weights, (biases != nullptr) ? biases : nullptr, output, info);
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.configure(input, weights, biases, output, info);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.configure(input, weights, biases, output, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+Status CpuDepthwiseConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+{
+ DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, info);
+ switch(depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ return CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ return CpuDepthwiseConvolutionGeneric::validate(input, weights, biases, output, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+DepthwiseConvolutionFunction CpuDepthwiseConvolution::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const ConvolutionInfo &info)
+{
+ if(bool(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info)))
+ {
+ return DepthwiseConvolutionFunction::OPTIMIZED;
+ }
+ else
+ {
+ return DepthwiseConvolutionFunction::GENERIC;
+ }
+}
+
+void CpuDepthwiseConvolution::run(ITensorPack &tensors)
+{
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.run(tensors);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.run(tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
+
+void CpuDepthwiseConvolution::prepare(ITensorPack &tensors)
+{
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.prepare(tensors);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.prepare(tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h
new file mode 100644
index 0000000000..e39cb7db4d
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEQUANTIZATION_H
+#define ARM_COMPUTE_CPU_DEQUANTIZATION_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConvolution : public ICpuOperator
+{
+public:
+ /** Default constructor */
+ CpuDepthwiseConvolution();
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[out] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] weights Weights tensor info. These are 3D tensors info with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+ /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConvolution
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a Depthwise Convolution Function
+ */
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overriden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+private:
+ /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+ *
+ * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
+ *
+ * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
+ * -# @ref CpuDepthwiseConvolution3x3Kernel if 3x3 and no assembly kernel implementation is present
+ * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
+ * -# @ref NEActivationLayer if fused activation is required
+ *
+ */
+ class CpuDepthwiseConvolutionOptimizedInternal : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConvolutionOptimizedInternal();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConvolutionOptimizedInternal(const CpuDepthwiseConvolutionOptimizedInternal &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConvolutionOptimizedInternal(CpuDepthwiseConvolutionOptimizedInternal &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConvolutionOptimizedInternal &operator=(const CpuDepthwiseConvolutionOptimizedInternal &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConvolutionOptimizedInternal &operator=(CpuDepthwiseConvolutionOptimizedInternal &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConvolutionOptimizedInternal() = default;
+ /** Initialize the function's source, destination, kernels and border_size.
+ *
+ * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution3x3
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+ // Inherited methods overriden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<CpuDepthwiseConvolutionAssemblyDispatch> _dwc_optimized_func{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_input{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_output{ nullptr };
+ std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
+ bool _has_bias{ false };
+ bool _is_quantized{ false };
+ bool _is_nchw{ true };
+ bool _permute{ false };
+ bool _is_activationlayer_enabled{ false };
+ bool _is_prepared{ false };
+ };
+
+ /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
+ *
+ * -# @ref CpuDepthwiseConvolutionNativeKernel
+ *
+ */
+ class CpuDepthwiseConvolutionGeneric : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConvolutionGeneric();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConvolutionGeneric(const CpuDepthwiseConvolutionGeneric &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConvolutionGeneric(CpuDepthwiseConvolutionGeneric &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConvolutionGeneric &operator=(const CpuDepthwiseConvolutionGeneric &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConvolutionGeneric &operator=(CpuDepthwiseConvolutionGeneric &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConvolutionGeneric() = default;
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[out] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionGeneric
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<kernels::CpuDepthwiseConvolutionNativeKernel> _depthwise_conv_kernel{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_input{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_output{ nullptr };
+ std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
+ bool _is_nchw{ true };
+ bool _is_prepared{ false };
+ bool _is_activationlayer_enabled{ false };
+ };
+
+ DepthwiseConvolutionFunction _depth_conv_func;
+ CpuDepthwiseConvolutionOptimizedInternal _func_optimized;
+ CpuDepthwiseConvolutionGeneric _func_generic;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEQUANTIZATION_H */
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp
index 101df98b7d..5f5304cded 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Utils.h"
@@ -41,6 +41,8 @@
namespace arm_compute
{
+namespace cpu
+{
namespace
{
std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x,
@@ -209,40 +211,37 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_
}
}
-std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor *input,
- const ITensor *weights,
- ITensor *output,
- PadStrideInfo conv_info,
- ActivationLayerInfo act_info,
- const Size2D &dilation)
+std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ ITensorInfo *output,
+ const ConvolutionInfo &info)
{
- ARM_COMPUTE_UNUSED(dilation);
- const DataType data_type = input->info()->data_type();
- const TensorShape shape = input->info()->tensor_shape();
+ const DataType data_type = input->data_type();
+ const TensorShape shape = input->tensor_shape();
const int n_batches = shape[3];
const int in_rows = shape.z();
const int in_cols = shape.y();
const int n_channels = shape.x();
- const int dilation_factor = dilation.x();
- const int padding_top = conv_info.pad_top();
- const int padding_left = conv_info.pad_left();
- const int padding_bottom = conv_info.pad_bottom();
- const int padding_right = conv_info.pad_right();
+ const int dilation_factor = info.dilation.x();
+ const int padding_top = info.pad_stride_info.pad_top();
+ const int padding_left = info.pad_stride_info.pad_left();
+ const int padding_bottom = info.pad_stride_info.pad_bottom();
+ const int padding_right = info.pad_stride_info.pad_right();
- const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QASYMM8);
- const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8);
+ const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- const unsigned int stride_x = conv_info.stride().first;
- const unsigned int kernel_size = weights->info()->tensor_shape().y();
+ const unsigned int stride_x = info.pad_stride_info.stride().first;
+ const unsigned int kernel_size = weights->tensor_shape().y();
// Map activation function
neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
- if(arm_compute::utils::info_helpers::is_relu(act_info))
+ if(arm_compute::utils::info_helpers::is_relu(info.act_info))
{
activation = neon_convolution_kernels::ActivationFunction::ReLU;
}
- else if(arm_compute::utils::info_helpers::is_relu6(act_info))
+ else if(arm_compute::utils::info_helpers::is_relu6(info.act_info))
{
activation = neon_convolution_kernels::ActivationFunction::ReLU6;
}
@@ -250,9 +249,9 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
// Create quantized convolver
if(is_uniform_quantized)
{
- const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo weights_qinfo = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform();
+ const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform();
+ const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
// Check that quantization info are in the range [0, 255]
ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
@@ -274,9 +273,9 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
}
else if(is_perchannel_quantized)
{
- const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
- const QuantizationInfo weights_qinfo = weights->info()->quantization_info();
- const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform();
+ const QuantizationInfo weights_qinfo = weights->quantization_info();
+ const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
// Check that quantization info are in the range [0, 255]
ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
@@ -328,83 +327,75 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
}
} // namespace
-struct NEDepthwiseConvolutionAssemblyDispatch::LocalImpl
+struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl
{
- std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel{ nullptr };
- NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel{};
+ std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr };
+ NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{};
+ bool is_prepared{ false };
+ experimental::MemoryRequirements mem_req{};
};
#ifndef DOXYGEN_SKIP_THIS
-NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false),
- _pImpl(std::make_unique<LocalImpl>())
+CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch()
+ : _pImpl(std::make_unique<LocalImpl>())
{
}
#endif /* DOXYGEN_SKIP_THIS */
-NEDepthwiseConvolutionAssemblyDispatch::~NEDepthwiseConvolutionAssemblyDispatch() = default;
+CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch() = default;
-void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input,
- const ITensor *weights,
- const ITensor *bias,
- ITensor *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(depth_multiplier);
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(),
- weights->info(),
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- conv_info,
- depth_multiplier,
- act_info,
- dilation));
+ ARM_COMPUTE_UNUSED(bias);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionAssemblyDispatch::validate(input,
+ weights,
+ bias != nullptr ? bias : nullptr,
+ output,
+ info));
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
- _input = input;
- _weights = weights;
- _bias = bias;
- _output = output;
- _is_prepared = false;
+ _pImpl->is_prepared = false;
// Create convolver
- _pImpl->_dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info, dilation);
- ARM_COMPUTE_ERROR_ON(_pImpl->_dwc_assembly_kernel == nullptr);
+ _pImpl->dwc_assembly_kernel = create_convolver(input, weights, output, info);
+ ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr);
// Create assembly kernel wrapper
- _pImpl->_dwc_acl_kernel.configure(_pImpl->_dwc_assembly_kernel.get());
+ _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get());
constexpr size_t alignment = 128;
// Create workspace
const unsigned int num_threads = NEScheduler::get().num_threads();
- const size_t workspace_size = _pImpl->_dwc_assembly_kernel->get_working_space_size(num_threads);
+ const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads);
ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
- _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
- _memory_group.manage(&_workspace);
- _workspace.allocator()->allocate();
+ _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment });
// Create packing tensor
- const size_t pack_tensor_size = _pImpl->_dwc_assembly_kernel->get_packed_params_size();
+ const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size();
ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
- _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
+
+ _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment });
+}
+
+experimental::MemoryRequirements CpuDepthwiseConvolutionAssemblyDispatch::workspace() const
+{
+ return _pImpl->mem_req;
}
-Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *bias,
- const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -415,12 +406,12 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
// Validate convolver
- ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, info));
// Validate activation
- const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
- const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6));
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6));
// Check bias
if(bias != nullptr)
@@ -433,7 +424,7 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
// Check output
if(output->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -451,11 +442,9 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
return Status{};
}
-bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
- const ITensorInfo *weights,
- PadStrideInfo conv_info,
- unsigned int depth_multiplier,
- const Size2D &dilation)
+bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
@@ -486,84 +475,90 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso
bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0);
// Check for supported strides
- const auto &strides = conv_info.stride();
+ const auto &strides = info.pad_stride_info.stride();
bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
// Check for supported padding
- const auto pad_top = conv_info.pad_top();
- const auto pad_right = conv_info.pad_right();
- const auto pad_bottom = conv_info.pad_bottom();
- const auto pad_left = conv_info.pad_left();
- PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), conv_info, DataLayout::NCHW, dilation);
+ const auto pad_top = info.pad_stride_info.pad_top();
+ const auto pad_right = info.pad_stride_info.pad_right();
+ const auto pad_bottom = info.pad_stride_info.pad_bottom();
+ const auto pad_left = info.pad_stride_info.pad_left();
+ PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation);
bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
bool supported_padding = is_same_padding || is_valid_padding;
// TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
- bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1));
+ bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1));
if(weights_type == DataType::QSYMM8_PER_CHANNEL)
{
- is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U));
+ is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U));
}
- return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported;
+ return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported;
}
-void NEDepthwiseConvolutionAssemblyDispatch::run()
+void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors)
{
// Prepare assembly kernel
- prepare();
+ prepare(tensors);
- MemoryGroupResourceScope scope_mg(_memory_group);
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto workspace = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
// Setup inputs/outputs
- ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
- _pImpl->_dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
-
- ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
- const int input_element_size = _input->info()->element_size();
- const int input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size;
- const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size;
- const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size;
- const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes();
- _pImpl->_dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
-
- ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
- const int output_element_size = _output->info()->element_size();
- const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size;
- const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size;
- const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size;
- void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes();
- _pImpl->_dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+ ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr);
+ _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer()));
+
+ ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr);
+ const int input_element_size = src->info()->element_size();
+ const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size;
+ const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size;
+ const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size;
+ const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+ _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+
+ ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr);
+ const int output_element_size = dst->info()->element_size();
+ const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size;
+ const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size;
+ const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size;
+ void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+ _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
// Schedule assembly kernel
- NEScheduler::get().schedule(&_pImpl->_dwc_acl_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX);
}
-void NEDepthwiseConvolutionAssemblyDispatch::prepare()
+void CpuDepthwiseConvolutionAssemblyDispatch::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if(!_pImpl->is_prepared)
{
- _packed_weights.allocator()->allocate();
- ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr);
// Pack weights and bias
- const int weights_element_size = _weights->info()->element_size();
- const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size;
- const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size;
- _pImpl->_dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
- _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
- weights_row_stride,
- weights_col_stride,
- (_bias != nullptr) ? _bias->buffer() : nullptr);
- _pImpl->_dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
-
- _weights->mark_as_unused();
- if(_bias != nullptr)
+ const int weights_element_size = weights->info()->element_size();
+ const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size;
+ const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size;
+ _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(),
+ weights->buffer() + weights->info()->offset_first_element_in_bytes(),
+ weights_row_stride,
+ weights_col_stride,
+ (bias != nullptr) ? bias->buffer() : nullptr);
+ _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer());
+
+ weights->mark_as_unused();
+ if(bias != nullptr)
{
- _bias->mark_as_unused();
+ bias->mark_as_unused();
}
- _is_prepared = true;
+ _pImpl->is_prepared = true;
}
}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h
new file mode 100644
index 0000000000..6aac74c3ef
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
+
+#include "src/runtime/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Depthwise convolution assembly kernel glue */
+class CpuDepthwiseConvolutionAssemblyDispatch : public ICpuOperator
+{
+public:
+ CpuDepthwiseConvolutionAssemblyDispatch();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConvolutionAssemblyDispatch(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConvolutionAssemblyDispatch(CpuDepthwiseConvolutionAssemblyDispatch &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConvolutionAssemblyDispatch &operator=(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConvolutionAssemblyDispatch &operator=(CpuDepthwiseConvolutionAssemblyDispatch &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConvolutionAssemblyDispatch();
+ /** Initialize the function's source, destination, kernels and border_size.
+ *
+ * @note Supports only NHWC format
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+ * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionAssemblyDispatch
+ *
+ * @note Supports only NHWC format
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+ * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return An error status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const ConvolutionInfo &info);
+ /** Check if the optimized kernel can be used for the given kernel sizes and strides
+ *
+ * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
+ *
+ * @param[in] input Input tensor info.
+ * @param[in] weights Weights tensor info.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
+ */
+ static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ struct LocalImpl;
+ std::unique_ptr<LocalImpl> _pImpl;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */