author    Manuel Bottini <manuel.bottini@arm.com>    2021-04-13 13:09:30 +0100
committer Manuel Bottini <manuel.bottini@arm.com>    2021-04-14 14:21:50 +0000
commit    327225d3b2f716d5c62d801a7fafc7d377521f34 (patch)
tree      c19125b74a5ddf9a63e165cbffa7a85b01c7aff1
parent    21c28957f9c6fe1a28ef934e711bb7474b8d65ee (diff)
Port NEDirectConvolutionLayer to new API
Partially resolves: COMPMID-4009

Change-Id: I19ffb61c5c4541134a5028677d2d81228740e454
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5419
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
-rw-r--r--  Android.bp                                                                                                                              |   5
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h                                                                          |  29
-rw-r--r--  docs/00_introduction.dox                                                                                                                |  10
-rw-r--r--  src/core/NEON/NEKernels.h                                                                                                               |   2
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h                                                                       | 102
-rw-r--r--  src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp (renamed from src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp)             | 353
-rw-r--r--  src/core/cpu/kernels/CpuDirectConvolutionKernel.h (renamed from src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h)                 |  75
-rw-r--r--  src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h                                                                            |  93
-rw-r--r--  src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp (renamed from src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp) | 150
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp                                                                                 | 109
-rw-r--r--  src/runtime/cpu/operators/CpuDirectConvolution.cpp                                                                                      | 147
-rw-r--r--  src/runtime/cpu/operators/CpuDirectConvolution.h                                                                                        | 121
12 files changed, 693 insertions, 503 deletions
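The port keeps the public interface of NEDirectConvolutionLayer intact, so existing callers are unaffected. For reference, a minimal usage sketch against that interface (illustrative only, not part of the patch; the tensor shapes and NCHW layout are assumptions):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // 32x32 F32 input with 3 channels, 3x3 kernels producing 8 output maps (NCHW).
        Tensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(30U, 30U, 8U), 1, DataType::F32));

        NEDirectConvolutionLayer conv{};
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src/weights/bias ...

        conv.run();
        return 0;
    }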
diff --git a/Android.bp b/Android.bp
index 7e2da8fe53..17281a49d1 100644
--- a/Android.bp
+++ b/Android.bp
@@ -176,8 +176,6 @@ cc_library_static {
"src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp",
"src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp",
"src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp",
- "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp",
- "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp",
"src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp",
"src/core/NEON/kernels/NEFFTRadixStageKernel.cpp",
"src/core/NEON/kernels/NEFFTScaleKernel.cpp",
@@ -301,6 +299,8 @@ cc_library_static {
"src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp",
"src/core/cpu/kernels/CpuCopyKernel.cpp",
"src/core/cpu/kernels/CpuDequantizationKernel.cpp",
+ "src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp",
+ "src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp",
"src/core/cpu/kernels/CpuElementwiseKernel.cpp",
"src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp",
"src/core/cpu/kernels/CpuFillKernel.cpp",
@@ -630,6 +630,7 @@ cc_library_static {
"src/runtime/cpu/operators/CpuConcatenate.cpp",
"src/runtime/cpu/operators/CpuCopy.cpp",
"src/runtime/cpu/operators/CpuDequantization.cpp",
+ "src/runtime/cpu/operators/CpuDirectConvolution.cpp",
"src/runtime/cpu/operators/CpuElementwise.cpp",
"src/runtime/cpu/operators/CpuElementwiseUnary.cpp",
"src/runtime/cpu/operators/CpuFill.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 86914fa0bc..fc4017e635 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -28,24 +28,18 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
-class NEDirectConvolutionLayerOutputStageKernel;
-class NEDirectConvolutionLayerKernel;
-class NEFillBorderKernel;
-
+class ITensor;
+class ITensorInfo;
/** Function to run the direct convolution.
*
- * This function calls the following kernels:
+ * This function calls the following:
*
- * -# @ref NEFillBorderKernel for the input
- * -# @ref NEDirectConvolutionLayerOutputStageKernel
- * -# @ref NEDirectConvolutionLayerKernel
+ * -# @ref cpu::CpuDirectConvolution
*/
class NEDirectConvolutionLayer : public IFunction
{
@@ -108,16 +102,9 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- std::unique_ptr<NEDirectConvolutionLayerOutputStageKernel> _output_stage_kernel;
- std::unique_ptr<NEDirectConvolutionLayerKernel> _conv_kernel;
- std::unique_ptr<NEFillBorderKernel> _input_border_handler;
- NEActivationLayer _activationlayer_function;
- Tensor _accumulator;
- bool _has_bias;
- bool _is_activationlayer_enabled;
- unsigned int _dim_split;
- bool _is_padding_required;
+ struct Impl;
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H */
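The function is now a thin pimpl wrapper over the new operator. Its Impl struct is defined in src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp (that hunk is not shown in this section); following the pattern used by the other ported functions, it plausibly holds the tensor pointers plus the owned operator, along these lines:

    // Assumed shape of the pimpl: the function keeps only raw tensor pointers
    // and defers all work to the cpu::CpuDirectConvolution operator it owns.
    struct NEDirectConvolutionLayer::Impl
    {
        ITensor                                    *src{ nullptr };
        const ITensor                              *weights{ nullptr };
        const ITensor                              *bias{ nullptr };
        ITensor                                    *dst{ nullptr };
        std::unique_ptr<cpu::CpuDirectConvolution>  op{ nullptr };
    };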
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 9f6af6da50..efc2963f6e 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -259,7 +259,7 @@ v20.11 Public major release
- NENonMaximaSuppression3x3Kernel
- @ref NERemapKernel
- @ref NEGEMMInterleave4x4Kernel
- - @ref NEDirectConvolutionLayerKernel
+ - NEDirectConvolutionLayerKernel
- NEScaleKernel
- NELocallyConnectedMatrixMultiplyKernel
- @ref NEGEMMLowpOffsetContributionKernel
@@ -269,7 +269,7 @@ v20.11 Public major release
- @ref NEDepthwiseConvolutionLayerNativeKernel
- @ref NEGEMMLowpMatrixMultiplyKernel
- @ref NEGEMMMatrixMultiplyKernel
- - @ref NEDirectConvolutionLayerOutputStageKernel
+ - NEDirectConvolutionLayerOutputStageKernel
- @ref NEReductionOperationKernel
- @ref NEGEMMLowpMatrixAReductionKernel
- @ref NEGEMMLowpMatrixBReductionKernel
@@ -682,7 +682,7 @@ v20.02 Public major release
- @ref NEConvolutionLayer
- @ref NEDepthwiseConvolutionLayer
- NEDepthwiseConvolutionLayer3x3Kernel
- - @ref NEDirectConvolutionLayerOutputStageKernel
+ - NEDirectConvolutionLayerOutputStageKernel
- @ref NEElementwiseComparison
- @ref NEElementwiseMax
- @ref NEElementwiseMin
@@ -1214,7 +1214,7 @@ v18.01 Public maintenance release
- GCGEMMTranspose1xWKernel
- GCIm2ColKernel
- Refactored Arm® Neon™ Winograd (NEWinogradLayerKernel)
- - Added @ref NEDirectConvolutionLayerOutputStageKernel
+ - Added NEDirectConvolutionLayerOutputStageKernel
- Added QASYMM8 support to the following Arm® Neon™ kernels:
- NEDepthwiseConvolutionLayer3x3Kernel
- @ref NEFillBorderKernel
@@ -1338,7 +1338,7 @@ v17.06 Public major release
- New Arm® Neon™ kernels / functions:
- @ref NEBatchNormalizationLayerKernel / @ref NEBatchNormalizationLayer
- NEDepthConcatenateLayerKernel / NEDepthConcatenateLayer
- - @ref NEDirectConvolutionLayerKernel / @ref NEDirectConvolutionLayer
+ - NEDirectConvolutionLayerKernel / @ref NEDirectConvolutionLayer
- NELocallyConnectedMatrixMultiplyKernel / NELocallyConnectedLayer
- @ref NEWeightsReshapeKernel / @ref NEConvolutionLayerReshapeWeights
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 59884e2d05..264f521be2 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -39,8 +39,6 @@
#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
deleted file mode 100644
index 8f7eeb05b2..0000000000
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input.
- *
- * @note We assume bias to be shared
- * @note For quantized computations (i.e. @p input of S32 type) the output data type for auto-initialization must be passed as part
- * of the @ref DirectConvolutionLayerOutputStageKernelInfo.
- */
-class NEDirectConvolutionLayerOutputStageKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDirectConvolutionLayerOutputStageKernel";
- }
- /** Default constructor */
- NEDirectConvolutionLayerOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerOutputStageKernel(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerOutputStageKernel &operator=(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerOutputStageKernel(NEDirectConvolutionLayerOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerOutputStageKernel &operator=(NEDirectConvolutionLayerOutputStageKernel &&) = default;
- /** Default destructor */
- ~NEDirectConvolutionLayerOutputStageKernel() = default;
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
- * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
- */
- void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel
- *
- * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
- * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias);
-
-private:
- OutputStageKernel *_func;
- ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _result_offset_after_shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H */
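The deleted header above documents the output-stage contract: accumulate a shared bias for F16/F32, or bias plus fixed-point requantization when the accumulator is S32. A simplified scalar sketch of that per-element math (the real kernel, now CpuDirectConvolutionStageKernel, is vectorized and uses gemmlowp-style saturating rounding-doubling multiplies, so the rounding here is only approximate):

    #include <algorithm>
    #include <cstdint>

    // F16/F32 path: just add the shared bias (in-place when no output is given).
    float output_stage_f32(float acc, float bias, bool has_bias)
    {
        return has_bias ? acc + bias : acc;
    }

    // S32 -> QASYMM8 path, simplified: add bias, scale by a fixed-point
    // multiplier, shift, add the output offset and saturate to [0, 255].
    uint8_t output_stage_s32(int32_t acc, int32_t bias, bool has_bias,
                             int32_t result_fixedpoint_multiplier,
                             int32_t result_shift,
                             int32_t result_offset_after_shift)
    {
        if(has_bias)
        {
            acc += bias;
        }
        // Approximates SaturatingRoundingDoublingHighMul(acc, multiplier).
        int64_t v = (static_cast<int64_t>(acc) * result_fixedpoint_multiplier + (1LL << 30)) >> 31;
        if(result_shift > 0)
        {
            v = (v + (1LL << (result_shift - 1))) >> result_shift; // rounding divide by 2^shift
        }
        v += result_offset_after_shift;
        return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, v)));
    }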
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp
index 98b76c7db3..4f46eb2bf6 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"
#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -46,6 +46,10 @@ using namespace arm_compute::detail;
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -154,19 +158,19 @@ template <unsigned int stridex>
class convolver_w1x1_i8x8_f32
{
public:
- static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim);
-
- const int input_stride_x = input->info()->strides_in_bytes().x();
- const int input_stride_y = input->info()->strides_in_bytes().y();
- const int input_stride_z = input->info()->strides_in_bytes().z();
- const int output_stride_y = output->info()->strides_in_bytes().y();
- const int output_stride_z = output->info()->strides_in_bytes().z();
+ ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim);
+ ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim);
+
+ const int input_stride_x = src->info()->strides_in_bytes().x();
+ const int input_stride_y = src->info()->strides_in_bytes().y();
+ const int input_stride_z = src->info()->strides_in_bytes().z();
+ const int output_stride_y = dst->info()->strides_in_bytes().y();
+ const int output_stride_z = dst->info()->strides_in_bytes().z();
const int kernel_stride_z = weights->info()->strides_in_bytes().z();
const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_h = output->info()->dimension(1);
+ const int output_h = dst->info()->dimension(1);
const int range_z = window.z().end() - window.z().start();
const int kernel_depth = weights->info()->dimension(Window::DimZ);
const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
@@ -175,8 +179,8 @@ public:
// setup output window for the iterator
Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
// setup input window for the iterator
@@ -187,8 +191,8 @@ public:
window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(output, window_out);
- Iterator in(input, window_in);
+ Iterator out(dst, window_out);
+ Iterator in(src, window_in);
Iterator k(weights, window_k);
const uint8_t *k_ptr = k.ptr();
@@ -237,17 +241,17 @@ class convolver_1x1
{
public:
static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
- const int input_stride_x = input->info()->strides_in_bytes().x();
- const int input_stride_y = input->info()->strides_in_bytes().y();
- const int input_stride_z = input->info()->strides_in_bytes().z();
- const int output_stride_y = output->info()->strides_in_bytes().y();
- const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int input_stride_x = src->info()->strides_in_bytes().x();
+ const int input_stride_y = src->info()->strides_in_bytes().y();
+ const int input_stride_z = src->info()->strides_in_bytes().z();
+ const int output_stride_y = dst->info()->strides_in_bytes().y();
+ const int output_stride_z = dst->info()->strides_in_bytes().z();
const int kernel_stride_z = weights->info()->strides_in_bytes().z();
const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = output->info()->dimension(0);
- const int output_h = output->info()->dimension(1);
+ const int output_w = dst->info()->dimension(0);
+ const int output_h = dst->info()->dimension(1);
const int range_z = window.z().end() - window.z().start();
const int kernel_depth = weights->info()->dimension(Window::DimZ);
const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
@@ -256,8 +260,8 @@ public:
// setup output window for the iterator
Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
// setup input window for the iterator
@@ -268,8 +272,8 @@ public:
window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(output, window_out);
- Iterator in(input, window_in);
+ Iterator out(dst, window_out);
+ Iterator in(src, window_in);
Iterator k(weights, window_k);
const uint8_t *k_ptr = k.ptr();
@@ -480,20 +484,20 @@ class convolver_3x3
{
public:
static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = input->info()->strides_in_bytes().x();
- const int input_stride_y = input->info()->strides_in_bytes().y();
- const int input_stride_z = input->info()->strides_in_bytes().z();
- const int output_stride_y = output->info()->strides_in_bytes().y();
- const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int input_stride_x = src->info()->strides_in_bytes().x();
+ const int input_stride_y = src->info()->strides_in_bytes().y();
+ const int input_stride_z = src->info()->strides_in_bytes().z();
+ const int output_stride_y = dst->info()->strides_in_bytes().y();
+ const int output_stride_z = dst->info()->strides_in_bytes().z();
const int kernel_stride_x = weights->info()->strides_in_bytes().x();
const int kernel_stride_y = weights->info()->strides_in_bytes().y();
const int kernel_stride_z = weights->info()->strides_in_bytes().z();
const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = output->info()->dimension(0);
- const int output_h = output->info()->dimension(1);
+ const int output_w = dst->info()->dimension(0);
+ const int output_h = dst->info()->dimension(1);
const int num_planes_z = window.z().end() - window.z().start();
const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
const int kernel_depth = weights->info()->dimension(Window::DimZ);
@@ -503,8 +507,8 @@ public:
// setup output window for the iterator
Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
// setup input window for the iterator
@@ -516,8 +520,8 @@ public:
Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(output, window_out);
- Iterator in(input, window_in);
+ Iterator out(dst, window_out);
+ Iterator in(src, window_in);
Iterator k(weights, window_k);
const uint8_t *k_ptr = k.ptr();
@@ -601,20 +605,20 @@ class convolver_5x5
{
public:
static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = input->info()->strides_in_bytes().x();
- const int input_stride_y = input->info()->strides_in_bytes().y();
- const int input_stride_z = input->info()->strides_in_bytes().z();
- const int output_stride_y = output->info()->strides_in_bytes().y();
- const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int input_stride_x = src->info()->strides_in_bytes().x();
+ const int input_stride_y = src->info()->strides_in_bytes().y();
+ const int input_stride_z = src->info()->strides_in_bytes().z();
+ const int output_stride_y = dst->info()->strides_in_bytes().y();
+ const int output_stride_z = dst->info()->strides_in_bytes().z();
const int kernel_stride_x = weights->info()->strides_in_bytes().x();
const int kernel_stride_y = weights->info()->strides_in_bytes().y();
const int kernel_stride_z = weights->info()->strides_in_bytes().z();
const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = output->info()->dimension(0);
- const int output_h = output->info()->dimension(1);
+ const int output_w = dst->info()->dimension(0);
+ const int output_h = dst->info()->dimension(1);
const int num_planes_z = window.z().end() - window.z().start();
const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
const int kernel_depth = weights->info()->dimension(Window::DimZ);
@@ -624,8 +628,8 @@ public:
// setup output window for the iterator
Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
// setup input window for the iterator
@@ -637,8 +641,8 @@ public:
Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(output, window_out);
- Iterator in(input, window_in);
+ Iterator out(dst, window_out);
+ Iterator in(src, window_in);
Iterator k(weights, window_k);
const uint8_t *k_ptr = k.ptr();
@@ -720,19 +724,19 @@ float vreduce(const float32x4_t &v)
template <typename T1, typename T2>
inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
switch(conv_stride_x)
{
case 1:
- convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 2:
- convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 3:
- convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
@@ -741,21 +745,21 @@ inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_i
template <>
inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- if(run_optim_small_tensor(input))
+ if(run_optim_small_tensor(src))
{
switch(conv_stride_x)
{
case 1:
- convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);
+ convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info);
break;
case 2:
- convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);
+ convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info);
break;
case 3:
- convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);
+ convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
@@ -766,13 +770,13 @@ inline void convolve_1x1<float, float>(const Window &window, unsigned int num_el
switch(conv_stride_x)
{
case 1:
- convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 2:
- convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 3:
- convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
@@ -782,19 +786,19 @@ inline void convolve_1x1<float, float>(const Window &window, unsigned int num_el
template <typename T1, typename T2>
inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
switch(conv_stride_x)
{
case 1:
- convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 2:
- convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 3:
- convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
@@ -803,72 +807,72 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_i
template <typename T1, typename T2>
inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
switch(conv_stride_x)
{
case 1:
- convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 2:
- convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
case 3:
- convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
}
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- const DataLayout data_layout = input->data_layout();
+ const DataLayout data_layout = src->data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != input->dimension(channel_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (input->data_type() == DataType::F16));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16));
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
- DataType data_type = input->data_type();
+ DataType data_type = src->data_type();
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
{
- ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- const DataLayout data_layout = input->data_layout();
+ const DataLayout data_layout = src->data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
// Calculate right and bottom border
unsigned int kernel_size = weights->dimension(width_idx);
const int conv_stride_x = std::get<0>(conv_info.stride());
const int conv_stride_y = std::get<1>(conv_info.stride());
- const int input_width = input->dimension(width_idx);
+ const int input_width = src->dimension(width_idx);
Window win{};
bool window_changed = false;
@@ -879,7 +883,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
{
case 1:
{
- switch(input->data_type())
+ switch(src->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
@@ -887,7 +891,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
- if(run_optim_small_tensor_info(input))
+ if(run_optim_small_tensor_info(src))
{
num_elems_written_per_iteration = 8;
}
@@ -905,7 +909,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
break;
}
case 3:
- switch(input->data_type())
+ switch(src->data_type())
{
case DataType::F32:
num_weight_elems_read_per_row = 4 + kernel_size - 1;
@@ -926,7 +930,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
break;
case 5:
{
- switch(input->data_type())
+ switch(src->data_type())
{
case DataType::F32:
num_weight_elems_read_per_row = 4 + kernel_size - 1;
@@ -948,7 +952,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Calculate right pad
int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
- int end_x = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
+ int end_x = ceil_to_multiple(static_cast<int>(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
// Calculate border
@@ -963,35 +967,35 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
border_size.bottom = conv_pad_bottom;
// Configure window
- win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+ win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration));
- AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
+ AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top,
num_elems_read_per_iteration, kernel_size,
conv_stride_x, conv_stride_y);
AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
- AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration);
window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
}
else
{
// Configure window NHWC without any padding
- win = calculate_max_window(*output, Steps());
+ win = calculate_max_window(*dst, Steps());
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
-bool have_zero_x_internal_padding(ITensorInfo *input, ITensorInfo *weights)
+bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights)
{
- return (input->padding().left == 0 && weights->padding().left == 0 && input->padding().right == 0 && weights->padding().right == 0);
+ return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
}
} // namespace
template <typename T>
-void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &window)
+void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
{
 // This function assumes that input and weights have no padding in the channel dimension
@@ -1001,19 +1005,19 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo
using tag_type = typename vtype::tag_type;
// Scalar quantities
- const int element_size = _input->info()->element_size();
- const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = _input->info()->dimension(1);
- const int input_dim_h = _input->info()->dimension(2);
+ const int element_size = src->info()->element_size();
+ const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
+ const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
+ const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
- const int output_stride_c = _output->info()->strides_in_bytes().x();
+ const int output_stride_c = dst->info()->strides_in_bytes().x();
- const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = _weights->info()->dimension(1);
- const int kernel_dim_h = _weights->info()->dimension(2);
+ const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
+ const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
+ const int kernel_dim_w = weights->info()->dimension(1);
+ const int kernel_dim_h = weights->info()->dimension(2);
const int conv_pad_top = _conv_info.pad_top();
const int conv_pad_left = _conv_info.pad_left();
@@ -1025,13 +1029,13 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
// Setup input window for the weights iterator
- Window window_w = calculate_max_window(*_weights->info(), Steps());
+ Window window_w = calculate_max_window(*weights->info(), Steps());
window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
- Iterator out(_output, window_out);
- Iterator wei(_weights, window_w);
+ Iterator out(dst, window_out);
+ Iterator wei(weights, window_w);
constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
/*
@@ -1079,7 +1083,7 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo
* As a reminder, the batches of the weights are translated into the
* channels of the output
*/
- const T *in_ptr_row = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes())
+ const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
+ id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
@@ -1112,7 +1116,7 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo
}
template <typename T>
-void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)
+void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
{
// Declare useful types
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -1120,19 +1124,19 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)
using tag_type = typename vtype::tag_type;
// Scalar quantities
- const int element_size = _input->info()->element_size();
- const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = _input->info()->dimension(1);
- const int input_dim_h = _input->info()->dimension(2);
+ const int element_size = src->info()->element_size();
+ const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
+ const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
+ const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
- const int output_stride_c = _output->info()->strides_in_bytes().x();
+ const int output_stride_c = dst->info()->strides_in_bytes().x();
- const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = _weights->info()->dimension(1);
- const int kernel_dim_h = _weights->info()->dimension(2);
+ const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
+ const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
+ const int kernel_dim_w = weights->info()->dimension(1);
+ const int kernel_dim_h = weights->info()->dimension(2);
const int conv_pad_top = _conv_info.pad_top();
const int conv_pad_left = _conv_info.pad_left();
@@ -1144,13 +1148,13 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
// Setup input window for the weights iterator
- Window window_w = calculate_max_window(*_weights->info(), Steps());
+ Window window_w = calculate_max_window(*weights->info(), Steps());
window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
- Iterator out(_output, window_out);
- Iterator wei(_weights, window_w);
+ Iterator out(dst, window_out);
+ Iterator wei(weights, window_w);
constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
@@ -1174,8 +1178,8 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)
const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
- const int index_c_end = _weights->info()->dimension(0);
- const T *const in_ptr_start = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
+ const int index_c_end = weights->info()->dimension(0);
+ const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
execute_window_loop(window_w, [&](const Coordinates & id_w)
{
@@ -1215,27 +1219,18 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)
out);
}
-NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
- : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
- _num_elems_written_per_iteration(0), _data_layout()
-{
-}
-
-BorderSize NEDirectConvolutionLayerKernel::border_size() const
+BorderSize CpuDirectConvolutionKernel::border_size() const
{
return _border_size;
}
-void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- _input = input;
- _weights = weights;
- _output = output;
_conv_info = conv_info;
- _data_layout = _input->info()->data_layout();
- _kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
+ _data_layout = src->data_layout();
+ _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
const unsigned int conv_pad_left = conv_info.pad_left();
const unsigned int conv_pad_top = conv_info.pad_top();
@@ -1251,33 +1246,33 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
}
// Get convolved dimensions
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
- DataType data_type = input->info()->data_type();
+ DataType data_type = src->data_type();
 // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, data_type);
+ auto_init_if_empty(*dst, output_shape, 1, data_type);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info));
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row,
+ auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row,
_num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ ICpuKernel::configure(win_config.second);
}
-Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
{
unsigned int num_weight_elems_read_per_row = 0;
unsigned int num_elems_read_per_iteration = 0;
unsigned int num_elems_written_per_iteration = 0;
BorderSize border_size = {};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
weights->clone().get(),
- output->clone().get(),
+ dst->clone().get(),
conv_info,
num_weight_elems_read_per_row,
num_elems_read_per_iteration,
@@ -1288,14 +1283,16 @@ Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const
return Status{};
}
-void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)
+void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
if(_data_layout == DataLayout::NCHW)
{
@@ -1303,14 +1300,14 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
{
case 1:
{
- switch(_input->info()->data_type())
+ switch(src->info()->data_type())
{
case DataType::F32:
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
@@ -1321,14 +1318,14 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
}
case 3:
{
- switch(_input->info()->data_type())
+ switch(src->info()->data_type())
{
case DataType::F32:
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
@@ -1339,10 +1336,10 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
}
case 5:
{
- switch(_input->info()->data_type())
+ switch(src->info()->data_type())
{
case DataType::F32:
- convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -1359,17 +1356,17 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
}
else
{
- switch(_input->info()->data_type())
+ switch(src->info()->data_type())
{
case DataType::F32:
{
- if(have_zero_x_internal_padding(_input->info(), _weights->info()))
+ if(have_zero_x_internal_padding(src->info(), weights->info()))
{
- convolve_nhwc_optimized<float>(window);
+ convolve_nhwc_optimized<float>(window, src, weights, dst);
}
else
{
- convolve_nhwc<float>(window);
+ convolve_nhwc<float>(window, src, weights, dst);
}
break;
}
@@ -1379,4 +1376,10 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
}
}
}
+const char *CpuDirectConvolutionKernel::name() const
+{
+ return "CpuDirectConvolutionLayerKernel";
+}
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
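The key change in run_op() above is that tensors now arrive through an ITensorPack instead of being cached at configure time, making the kernel stateless with respect to tensor memory. A sketch of how an owning operator is expected to drive it, assuming the three-argument schedule_op() form used around this release (later releases also pass the kernel's window explicitly):

    #include <memory>
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"

    using namespace arm_compute;

    void run_direct_conv(ITensor *src, ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
    {
        // Configure once with tensor metadata only.
        auto kernel = std::make_unique<cpu::kernels::CpuDirectConvolutionKernel>();
        kernel->configure(src->info(), weights->info(), dst->info(), conv_info);

        // At run time, bind the concrete tensors to the ids read back in run_op().
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, src);
        pack.add_const_tensor(TensorType::ACL_SRC_1, weights);
        pack.add_tensor(TensorType::ACL_DST, dst);
        NEScheduler::get().schedule_op(kernel.get(), Window::DimZ, pack);
    }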
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/src/core/cpu/kernels/CpuDirectConvolutionKernel.h
index 259eb683f6..fb8218394b 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/src/core/cpu/kernels/CpuDirectConvolutionKernel.h
@@ -21,89 +21,80 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
-#include "src/core/NEON/INEKernel.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
namespace arm_compute
{
class ITensor;
-
+namespace cpu
+{
+namespace kernels
+{
/** Interface for the kernel to perform Direct Convolution Layer. */
-class NEDirectConvolutionLayerKernel : public INEKernel
+class CpuDirectConvolutionKernel : public ICpuKernel
{
public:
- const char *name() const override
- {
- return "NEDirectConvolutionLayerKernel";
- }
/** Default constructor */
- NEDirectConvolutionLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
- /** Default destructor */
- ~NEDirectConvolutionLayerKernel() = default;
+ CpuDirectConvolutionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionKernel);
 /** Set the src, weights, and dst tensor info.
*
     * @note: DirectConvolution only works in the following configurations:
     *    1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
     *    3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
     *    5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 (F32 only)
*
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+ * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
     * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
     * Data type supported: Same as @p src.
- * @param[out] output Output tensor.
+ * @param[out] dst Output tensor.
     * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel
+ void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionKernel
*
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+ * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
     * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
     * Data type supported: Same as @p src.
- * @param[in] output Output tensor.
+ * @param[in] dst Output tensor.
     * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info);
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info);
// Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+ BorderSize border_size() const override;
private:
    /* Template function for optimized NHWC convolution */
template <typename T>
- void convolve_nhwc_optimized(const Window &window);
+ void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
    /* Template function for generic NHWC convolution */
template <typename T>
- void convolve_nhwc(const Window &window);
+ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
- const ITensor *_input;
- const ITensor *_weights;
- ITensor *_output;
- PadStrideInfo _conv_info;
- BorderSize _border_size;
- unsigned int _kernel_size;
- unsigned int _num_weight_elems_read_per_row;
- unsigned int _num_elems_read_per_iteration;
- unsigned int _num_elems_written_per_iteration;
- DataLayout _data_layout;
+ PadStrideInfo _conv_info{};
+ BorderSize _border_size{};
+ unsigned int _kernel_size{ 0 };
+ unsigned int _num_weight_elems_read_per_row{ 0 };
+ unsigned int _num_elems_read_per_iteration{ 0 };
+ unsigned int _num_elems_written_per_iteration{ 0 };
+ DataLayout _data_layout{ DataLayout::UNKNOWN };
};
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H */
+#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H */
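A note on usage: the ported kernel is stateless with respect to tensors — configure() consumes only ITensorInfo, and the actual ITensor objects arrive at run time through an ITensorPack. A minimal sketch of driving the kernel directly, assuming the ACL_SRC_0/ACL_SRC_1/ACL_DST slot assignment that NEDirectConvolutionLayer::run builds later in this patch (shapes, strides and the helper name are illustrative):

    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"

    using namespace arm_compute;

    // Hypothetical helper, not part of this patch
    void run_direct_conv_kernel(ITensor *src, ITensor *weights, ITensor *dst)
    {
        cpu::kernels::CpuDirectConvolutionKernel kernel;
        // configure() takes tensor metadata only; no ITensor pointers are cached
        kernel.configure(src->info(), weights->info(), dst->info(), PadStrideInfo(1, 1, 1, 1));

        // Tensors are bound at run time through the pack
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, src);
        pack.add_tensor(TensorType::ACL_SRC_1, weights);
        pack.add_tensor(TensorType::ACL_DST, dst);
        // The operator splits on DimY for NHWC and DimZ for NCHW
        NEScheduler::get().schedule_op(&kernel, Window::DimY, kernel.window(), pack);
    }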
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h
new file mode 100644
index 0000000000..9eeab194cb
--- /dev/null
+++ b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input.
+ *
+ * @note We assume bias to be shared
+ * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
+ * of the @ref DirectConvolutionLayerOutputStageKernelInfo.
+ */
+class CpuDirectConvolutionOutputStageKernel : public ICpuKernel
+{
+public:
+ /** Default constructor */
+ CpuDirectConvolutionOutputStageKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionOutputStageKernel);
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] src Input to add the bias to. If @p dst is not specified, then accumulation is done in-place.
+ * Data type supported: F16/F32/S32
+ * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
+ * @param[out] dst (Optional) If the output tensor is specified, the accumulation is done out-of-place. (Defaults to nullptr)
+ * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
+ * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
+ * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+ */
+ void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionOutputStageKernel
+ *
+ * @param[in] src Input to add the bias to. If @p dst is not specified, then accumulation is done in-place.
+ * Data type supported: F16/F32/S32
+ * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
+ * @param[in] dst (Optional) If the output tensor is specified, the accumulation is done out-of-place. (Defaults to nullptr)
+ * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
+ * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
+ * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
+
+ OutputStageKernel *_func{ nullptr };
+ int _result_fixedpoint_multiplier{ 0 };
+ int _result_shift{ 0 };
+ int _result_offset_after_shift{ 0 };
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H */
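For the quantized path, everything the output stage needs besides the tensors now comes in through the descriptor. A hedged configuration sketch for the S32 -> QASYMM8 case, using only the descriptor fields this patch reads (the numeric values are made up):

    #include "arm_compute/core/KernelDescriptors.h"
    #include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"

    using namespace arm_compute;

    // Hypothetical helper, not part of this patch
    void configure_outstage(ITensorInfo &acc_s32, const ITensorInfo &bias_s32, ITensorInfo &dst_q8)
    {
        DirectConvolutionLayerOutputStageKernelInfo info{};
        info.result_fixedpoint_multiplier = 1395864371;        // illustrative multiplier
        info.result_shift                 = 6;                 // illustrative shift
        info.result_offset_after_shift    = 128;               // illustrative zero point
        info.output_data_type             = DataType::QASYMM8; // required when src is S32

        cpu::kernels::CpuDirectConvolutionOutputStageKernel outstage;
        // dst_q8 may be empty; configure() auto-initializes it to info.output_data_type
        outstage.configure(&acc_s32, &bias_s32, &dst_q8, info);
    }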
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp
index 3597045bd5..d955b0b461 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -44,42 +44,46 @@
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
if(bias != nullptr)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
}
- if(input->data_type() == DataType::S32)
+ if(src->data_type() == DataType::S32)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output == nullptr, "In-place computation not allowed for quantized output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if((dst != nullptr) && (dst->total_size() != 0))
{
- if(is_data_type_float(input->data_type()))
+ if(is_data_type_float(src->data_type()))
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
}
- else if(input->data_type() == DataType::S32)
+ else if(src->data_type() == DataType::S32)
{
// In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
@@ -90,25 +94,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
+output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
+ const bool has_bias = bias != nullptr;
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
- ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
ARM_COMPUTE_UNUSED(result_shift);
ARM_COMPUTE_UNUSED(result_offset_after_shift);
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
- const int window_step_x = 16 / input->info()->element_size();
+ const int window_step_x = 16 / src->info()->element_size();
Window win = window;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator in(input, win);
- Iterator out(output, win);
+ Iterator in(src, win);
+ Iterator out(dst, win);
execute_window_loop(win, [&](const Coordinates & id)
{
int x = window_start_x;
@@ -151,9 +156,10 @@ output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITe
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
+output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
+ const bool has_bias = bias != nullptr;
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
ARM_COMPUTE_UNUSED(result_shift);
ARM_COMPUTE_UNUSED(result_offset_after_shift);
@@ -166,13 +172,13 @@ output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITe
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
- const int window_step_x = 16 / input->info()->element_size();
+ const int window_step_x = 16 / src->info()->element_size();
Window win = window;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator in(input, win);
+ Iterator in(src, win);
Iterator bi(bias, window_bias);
- Iterator out(output, win);
+ Iterator out(dst, win);
execute_window_loop(win, [&](const Coordinates &)
{
@@ -216,11 +222,12 @@ output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITe
// Quantized case
template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
+void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
+ const bool has_bias = bias != nullptr;
+ using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
+ using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
@@ -229,12 +236,12 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
- const int window_step_x = 16 / input->info()->element_size();
+ const int window_step_x = 16 / src->info()->element_size();
Window win = window;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator in(input, win);
- Iterator out(output, win);
+ Iterator in(src, win);
+ Iterator out(dst, win);
execute_window_loop(win, [&](const Coordinates & id)
{
@@ -295,11 +302,12 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window
in, out);
}
template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
+void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
+ const bool has_bias = bias != nullptr;
+ using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
+ using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
@@ -314,13 +322,13 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
- const int window_step_x = 16 / input->info()->element_size();
+ const int window_step_x = 16 / src->info()->element_size();
Window win = window;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator in(input, win);
+ Iterator in(src, win);
Iterator bi(bias, window_bias);
- Iterator out(output, win);
+ Iterator out(dst, win);
execute_window_loop(win, [&](const Coordinates &)
{
@@ -377,45 +385,38 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window
}
} // namespace
-NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0)
-{
-}
-
-void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
+void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
{
+ ARM_COMPUTE_UNUSED(bias);
// Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(), info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
_func = nullptr;
- _bias = bias;
- _input = input;
- _output = (output != nullptr) ? output : input;
_result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
_result_shift = info.result_shift;
_result_offset_after_shift = info.result_offset_after_shift;
 // Auto-initialize the output if required
- if(output != nullptr && output->info() != nullptr)
+ if(dst != nullptr)
{
// Work out expected output data type
- const DataType output_dt = (input->info()->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
+ const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
+ auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt));
}
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*src, Steps());
- INEKernel::configure(win);
+ ICpuKernel::configure(win);
- const bool is_qasymm8_signed = (output != nullptr) ? is_data_type_quantized_asymmetric_signed(output->info()->data_type()) : false;
+ const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
// Set appropriate function
- if(input->info()->data_layout() == DataLayout::NCHW)
+ if(src->data_layout() == DataLayout::NCHW)
{
- switch(input->info()->data_type())
+ switch(src->data_type())
{
case DataType::S32:
{
@@ -449,7 +450,7 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
}
else
{
- switch(input->info()->data_type())
+ switch(src->data_type())
{
case DataType::S32:
{
@@ -483,22 +484,31 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
}
}
-Status NEDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
+Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
return Status{};
}
-void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const ThreadInfo &info)
+void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- const bool has_bias = _bias != nullptr;
- (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, has_bias);
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
+}
+
+const char *CpuDirectConvolutionOutputStageKernel::name() const
+{
+ return "CpuDirectConvolutionOutputStageKernel";
}
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
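Per element, the quantized output stage boils down to a bias add followed by a fixed-point requantization driven by the three descriptor parameters. A scalar reference, assuming the NEON code implements the usual gemmlowp-style scheme (rounding-doubling high multiply, rounding right shift, offset, saturation) and ignoring the saturating corner case of the multiply:

    #include <algorithm>
    #include <cstdint>

    // Sketch of the per-element math, not the vectorized kernel
    inline uint8_t requantize(int32_t acc, int32_t bias, int32_t multiplier, int32_t shift, int32_t offset)
    {
        const int32_t v = acc + bias; // accumulate the shared bias
        // Rounding-doubling high multiply: round(2 * v * multiplier / 2^32)
        int64_t m = (static_cast<int64_t>(v) * multiplier + (1ll << 30)) >> 31;
        if(shift > 0)
        {
            m = (m + (1ll << (shift - 1))) >> shift; // rounding right shift
        }
        m += offset; // result_offset_after_shift
        return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, m))); // clamp to QASYMM8
    }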
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index a953edc78f..73834381c6 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,107 +27,48 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/runtime/cpu/operators/CpuDirectConvolution.h"
namespace arm_compute
{
-NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
+struct NEDirectConvolutionLayer::Impl
+{
+ ITensor *src{ nullptr };
+ const ITensor *weights{ nullptr };
+ const ITensor *bias{ nullptr };
+ ITensor *dst{ nullptr };
+ std::unique_ptr<cpu::CpuDirectConvolution> op{ nullptr };
+};
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
- _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+ : _memory_manager(std::move(memory_manager)), _impl(std::make_unique<Impl>())
{
}
+NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
- _output_stage_kernel = std::make_unique<NEDirectConvolutionLayerOutputStageKernel>();
- _conv_kernel = std::make_unique<NEDirectConvolutionLayerKernel>();
- _input_border_handler = std::make_unique<NEFillBorderKernel>();
-
- // Free accumulator
- if(_accumulator.buffer() != nullptr)
- {
- _accumulator.allocator()->free();
- }
-
- _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
-
- // Check if bias should be added in the convolution result
- _has_bias = (bias != nullptr);
-
- _conv_kernel->configure(input, weights, output, conv_info);
- if(_has_bias)
- {
- _output_stage_kernel->configure(output, bias);
- }
- _is_padding_required = !_conv_kernel->border_size().empty();
-
- if(_is_padding_required)
- {
- // Add zero padding XY
- _input_border_handler->configure(input, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
- }
-
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
+ _impl->src = input;
+ _impl->weights = weights;
+ _impl->bias = bias;
+ _impl->dst = output;
+ _impl->op = std::make_unique<cpu::CpuDirectConvolution>(_memory_manager);
+ _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
}
Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-
- // output might not be initialized since it can be an intermediate tensor of another layer
- DataType data_type = input->data_type();
- TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
-
- // Validate Convolution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
- "Biases size and number of input feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
- }
-
- // Validate bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
-
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
- }
-
- return Status{};
+ return cpu::CpuDirectConvolution::validate(input, weights, bias, output, conv_info, act_info);
}
void NEDirectConvolutionLayer::run()
{
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_is_padding_required)
- {
- NEScheduler::get().schedule(_input_border_handler.get(), Window::DimZ);
- }
- NEScheduler::get().schedule(_conv_kernel.get(), _dim_split);
- if(_has_bias)
- {
- NEScheduler::get().schedule(_output_stage_kernel.get(), Window::DimY);
- }
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, _impl->bias);
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+ _impl->op->run(pack);
}
} // namespace arm_compute
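Call sites are unaffected by the port: NEDirectConvolutionLayer keeps its ITensor-based interface and simply forwards to the new cpu operator through the pimpl above. A typical usage sketch (shapes illustrative):

    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void example()
    {
        Tensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

        NEDirectConvolutionLayer conv;
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1)); // 3x3, stride 1, pad 1

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        conv.run(); // builds the ITensorPack shown above and runs cpu::CpuDirectConvolution
    }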
diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.cpp b/src/runtime/cpu/operators/CpuDirectConvolution.cpp
new file mode 100644
index 0000000000..33f79603e8
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDirectConvolution.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuDirectConvolution.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuDirectConvolution::~CpuDirectConvolution() = default;
+
+CpuDirectConvolution::CpuDirectConvolution(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
+ _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+{
+}
+
+void CpuDirectConvolution::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ _output_stage_kernel = std::make_unique<kernels::CpuDirectConvolutionOutputStageKernel>();
+ _conv_kernel = std::make_unique<kernels::CpuDirectConvolutionKernel>();
+ _input_border_handler = std::make_unique<NEFillBorderKernel>();
+
+ // Free accumulator
+ if(_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
+ // Check if bias should be added in the convolution result
+ _has_bias = (bias != nullptr);
+
+ _conv_kernel->configure(src, weights, dst, conv_info);
+ if(_has_bias)
+ {
+ _output_stage_kernel->configure(dst, bias);
+ }
+ _is_padding_required = !_conv_kernel->border_size().empty();
+
+ if(_is_padding_required)
+ {
+ // Add zero padding XY
+ _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ }
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<CpuActivation>();
+ _activationlayer_function->configure(dst, dst, act_info);
+ }
+}
+
+Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+
+ // dst might not be initialized since it can be an intermediate tensor of another layer
+ DataType data_type = src->data_type();
+ TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+ // Validate Convolution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionKernel::validate(src, weights, &accumulator, conv_info));
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+ "Biases size and number of input feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // Validate bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionOutputStageKernel::validate(&accumulator, bias, dst));
+
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
+ }
+
+ return Status{};
+}
+
+void CpuDirectConvolution::run(ITensorPack &tensors)
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ if(_is_padding_required)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_DST, src);
+ NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+ }
+ NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
+ if(_has_bias)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, dst);
+ pack.add_tensor(TensorType::ACL_SRC_1, bias);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
+ }
+
+ if(_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.h b/src/runtime/cpu/operators/CpuDirectConvolution.h
new file mode 100644
index 0000000000..0635e087fd
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDirectConvolution.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H
+#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to run the direct convolution.
+ *
+ * This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel for the input
+ * -# @ref kernels::CpuDirectConvolutionKernel
+ * -# @ref kernels::CpuDirectConvolutionOutputStageKernel
+ */
+class CpuDirectConvolution : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuDirectConvolution(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Destructor */
+ ~CpuDirectConvolution();
+ /** Set the input, weights, biases and output tensor info.
+ *
+ * @note: DirectConvolution only works in the following configurations:
+ * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
+ * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
+ * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32
+ *
+ * @param[in, out] src Input tensor info. Data types supported: F16/F32.
+ * @param[in] weights Set of kernels to convolve the input volume.
+ * Supported sizes: 1x1, 3x3 and 5x5.
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src.
+ * @param[out] dst Output tensor info.
+ * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolution
+ *
+ * @note: DirectConvolution only works in the following configurations:
+ * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
+ * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
+ * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32
+ *
+ * @param[in] src Input tensor info. Data types supported: F16/F32.
+ * @param[in] weights Set of kernels to convolve the input volume.
+ * Supported sizes: 1x1, 3x3 and 5x5.
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src.
+ * @param[in] dst Output tensor info.
+ * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ MemoryGroup _memory_group;
+ std::unique_ptr<kernels::CpuDirectConvolutionOutputStageKernel> _output_stage_kernel;
+ std::unique_ptr<kernels::CpuDirectConvolutionKernel> _conv_kernel;
+ std::unique_ptr<NEFillBorderKernel> _input_border_handler;
+ std::unique_ptr<CpuActivation> _activationlayer_function;
+ Tensor _accumulator;
+ bool _has_bias{ false };
+ bool _is_activationlayer_enabled{ false };
+ unsigned int _dim_split{ 0 };
+ bool _is_padding_required{ false };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H */
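At the operator level the intended pattern is validate on metadata, configure once, then run with a pack. A minimal sketch, assuming the same ACL_SRC_0/1/2 and ACL_DST slots that NEDirectConvolutionLayer::run populates in this patch:

    #include "src/runtime/cpu/operators/CpuDirectConvolution.h"

    using namespace arm_compute;

    // Hypothetical driver, not part of this patch
    void run_direct_conv(ITensor *src, ITensor *weights, ITensor *bias, ITensor *dst, const PadStrideInfo &conv_info)
    {
        if(!cpu::CpuDirectConvolution::validate(src->info(), weights->info(), bias->info(), dst->info(), conv_info))
        {
            return; // configuration not supported
        }

        cpu::CpuDirectConvolution conv;
        conv.configure(src->info(), weights->info(), bias->info(), dst->info(), conv_info);

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, src);
        pack.add_tensor(TensorType::ACL_SRC_1, weights);
        pack.add_tensor(TensorType::ACL_SRC_2, bias);
        pack.add_tensor(TensorType::ACL_DST, dst);
        conv.run(pack);
    }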