author     Manuel Bottini <manuel.bottini@arm.com>   2021-04-13 13:09:30 +0100
committer  Manuel Bottini <manuel.bottini@arm.com>   2021-04-14 14:21:50 +0000
commit     327225d3b2f716d5c62d801a7fafc7d377521f34 (patch)
tree       c19125b74a5ddf9a63e165cbffa7a85b01c7aff1 /src
parent     21c28957f9c6fe1a28ef934e711bb7474b8d65ee (diff)
download   ComputeLibrary-327225d3b2f716d5c62d801a7fafc7d377521f34.tar.gz
Port NEDirectConvolutionLayer to new API
Partially resolves: COMPMID-4009
Change-Id: I19ffb61c5c4541134a5028677d2d81228740e454
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5419
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
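
This patch replaces the stateful NEON kernel interface (tensors bound at configure() time and executed via run()) with the stateless Cpu kernel interface: configure() takes only ITensorInfo pointers, and the tensors are supplied per call through an ITensorPack to run_op(). A minimal sketch of how the ported kernel is driven, using the names visible in the diff below; the tensor setup and the single-threaded dispatch are illustrative assumptions, not part of the patch:

```cpp
#include "arm_compute/core/CPPTypes.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"

using namespace arm_compute;

// Illustrative driver: src/weights/dst are assumed to be already-allocated
// tensors with shapes and data types accepted by validate().
void run_direct_conv(ITensor *src, ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
    cpu::kernels::CpuDirectConvolutionKernel kernel;

    // configure() now takes ITensorInfo* only, so the configured kernel holds
    // no tensor pointers and can be reused across workloads of the same shape.
    kernel.configure(src->info(), weights->info(), dst->info(), conv_info);

    // Tensors are bound per execution through an ITensorPack, matching the
    // ACL_SRC_0 / ACL_SRC_1 / ACL_DST ids read back inside run_op().
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights);
    pack.add_tensor(TensorType::ACL_DST, dst);

    // Single-threaded dispatch over the full execution window; production
    // code would normally go through the scheduler instead.
    kernel.run_op(pack, kernel.window(), ThreadInfo{});
}
```
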
Diffstat (limited to 'src')
 src/core/NEON/NEKernels.h                                                                                                                    |   2
 src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h                                                                            | 102
 src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp (renamed from src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp)                  | 353
 src/core/cpu/kernels/CpuDirectConvolutionKernel.h (renamed from src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h)                      |  75
 src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h                                                                                 |  93
 src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp (renamed from src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp)  | 150
 src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp                                                                                      | 109
 src/runtime/cpu/operators/CpuDirectConvolution.cpp                                                                                           | 147
 src/runtime/cpu/operators/CpuDirectConvolution.h                                                                                             | 121

9 files changed, 677 insertions(+), 475 deletions(-)
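
The runtime side of the port is not fully visible below (the dump cuts off inside the output-stage kernel), but the diffstat shows NEDirectConvolutionLayer.cpp shrinking to a thin wrapper around the new src/runtime/cpu/operators/CpuDirectConvolution operator. The following is a hedged sketch of the wrapper pattern these ports follow, assuming the header declares a pimpl member `_impl`; the Impl member names and the ACL_SRC_2 slot for the bias are assumptions, not taken from this patch body:

```cpp
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "src/runtime/cpu/operators/CpuDirectConvolution.h"

#include <memory>

namespace arm_compute
{
// Assumed pimpl layout: the public function keeps the tensor pointers and
// owns the stateless operator; the kernels themselves capture nothing.
struct NEDirectConvolutionLayer::Impl
{
    ITensor                                   *src{ nullptr };
    const ITensor                             *weights{ nullptr };
    const ITensor                             *bias{ nullptr };
    ITensor                                   *dst{ nullptr };
    std::unique_ptr<cpu::CpuDirectConvolution> op{ nullptr };
};

void NEDirectConvolutionLayer::run()
{
    // Tensors are handed to the operator per call, mirroring the
    // kernel-level ITensorPack flow shown in run_op() below.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, _impl->weights);
    pack.add_const_tensor(TensorType::ACL_SRC_2, _impl->bias); // bias slot: assumption
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack);
}
} // namespace arm_compute
```
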
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index 59884e2d05..264f521be2 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -39,8 +39,6 @@ #include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" #include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" -#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h deleted file mode 100644 index 8f7eeb05b2..0000000000 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H -#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; -/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. - * - * @note We assume bias to be shared - * @note For quantized computations (i.e. @p input of S32 type) the output data type for auto-initialization must be passed as part - * of the @ref DirectConvolutionLayerOutputStageKernelInfo. 
- */ -class NEDirectConvolutionLayerOutputStageKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEDirectConvolutionLayerOutputStageKernel"; - } - /** Default constructor */ - NEDirectConvolutionLayerOutputStageKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDirectConvolutionLayerOutputStageKernel(const NEDirectConvolutionLayerOutputStageKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDirectConvolutionLayerOutputStageKernel &operator=(const NEDirectConvolutionLayerOutputStageKernel &) = delete; - /** Allow instances of this class to be moved */ - NEDirectConvolutionLayerOutputStageKernel(NEDirectConvolutionLayerOutputStageKernel &&) = default; - /** Allow instances of this class to be moved */ - NEDirectConvolutionLayerOutputStageKernel &operator=(NEDirectConvolutionLayerOutputStageKernel &&) = default; - /** Default destructor */ - ~NEDirectConvolutionLayerOutputStageKernel() = default; - /** Set the accumulate buffer and the biases of the kernel. - * - * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: F16/F32/S32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input - * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. - * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32 - * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata - */ - void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel - * - * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: F16/F32/S32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input - * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. 
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32 - * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias); - -private: - OutputStageKernel *_func; - ITensor *_input; - const ITensor *_bias; - ITensor *_output; - int _result_fixedpoint_multiplier; - int _result_shift; - int _result_offset_after_shift; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H */ diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp index 98b76c7db3..4f46eb2bf6 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" +#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h" #include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -46,6 +46,10 @@ using namespace arm_compute::detail; namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -154,19 +158,19 @@ template <unsigned int stridex> class convolver_w1x1_i8x8_f32 { public: - static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim); - - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int output_stride_z = output->info()->strides_in_bytes().z(); + ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim); + ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim); + + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); const int kernel_stride_z = weights->info()->strides_in_bytes().z(); const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_h = output->info()->dimension(1); + const int output_h = dst->info()->dimension(1); const int range_z = window.z().end() - 
window.z().start(); const int kernel_depth = weights->info()->dimension(Window::DimZ); const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); @@ -175,8 +179,8 @@ public: // setup output window for the iterator Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); // setup input window for the iterator @@ -187,8 +191,8 @@ public: window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(output, window_out); - Iterator in(input, window_in); + Iterator out(dst, window_out); + Iterator in(src, window_in); Iterator k(weights, window_k); const uint8_t *k_ptr = k.ptr(); @@ -237,17 +241,17 @@ class convolver_1x1 { public: static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int output_stride_z = output->info()->strides_in_bytes().z(); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); const int kernel_stride_z = weights->info()->strides_in_bytes().z(); const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); const int range_z = window.z().end() - window.z().start(); const int kernel_depth = weights->info()->dimension(Window::DimZ); const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); @@ -256,8 +260,8 @@ public: // setup output window for the iterator Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); window_out.set(Window::DimZ, Window::Dimension(window.z().start(), 
window.z().end(), range_z)); // setup input window for the iterator @@ -268,8 +272,8 @@ public: window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(output, window_out); - Iterator in(input, window_in); + Iterator out(dst, window_out); + Iterator in(src, window_in); Iterator k(weights, window_k); const uint8_t *k_ptr = k.ptr(); @@ -480,20 +484,20 @@ class convolver_3x3 { public: static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int output_stride_z = output->info()->strides_in_bytes().z(); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); const int kernel_stride_x = weights->info()->strides_in_bytes().x(); const int kernel_stride_y = weights->info()->strides_in_bytes().y(); const int kernel_stride_z = weights->info()->strides_in_bytes().z(); const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); const int num_planes_z = window.z().end() - window.z().start(); const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); const int kernel_depth = weights->info()->dimension(Window::DimZ); @@ -503,8 +507,8 @@ public: // setup output window for the iterator Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); // setup input window for the iterator @@ -516,8 +520,8 @@ public: Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(output, window_out); - Iterator in(input, window_in); + Iterator out(dst, window_out); + Iterator in(src, window_in); Iterator k(weights, window_k); const uint8_t *k_ptr = k.ptr(); @@ -601,20 +605,20 @@ class convolver_5x5 { public: static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const 
PadStrideInfo &conv_info) + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int output_stride_z = output->info()->strides_in_bytes().z(); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); const int kernel_stride_x = weights->info()->strides_in_bytes().x(); const int kernel_stride_y = weights->info()->strides_in_bytes().y(); const int kernel_stride_z = weights->info()->strides_in_bytes().z(); const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); const int num_planes_z = window.z().end() - window.z().start(); const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); const int kernel_depth = weights->info()->dimension(Window::DimZ); @@ -624,8 +628,8 @@ public: // setup output window for the iterator Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); // setup input window for the iterator @@ -637,8 +641,8 @@ public: Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(output, window_out); - Iterator in(input, window_in); + Iterator out(dst, window_out); + Iterator in(src, window_in); Iterator k(weights, window_k); const uint8_t *k_ptr = k.ptr(); @@ -720,19 +724,19 @@ float vreduce(const float32x4_t &v) template <typename T1, typename T2> inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); switch(conv_stride_x) { case 1: - convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 2: - convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, 
weights, output, conv_info); + convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 3: - convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; default: ARM_COMPUTE_ERROR("Not implemented"); @@ -741,21 +745,21 @@ inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_i template <> inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - if(run_optim_small_tensor(input)) + if(run_optim_small_tensor(src)) { switch(conv_stride_x) { case 1: - convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info); + convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info); break; case 2: - convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info); + convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info); break; case 3: - convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info); + convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info); break; default: ARM_COMPUTE_ERROR("Not implemented"); @@ -766,13 +770,13 @@ inline void convolve_1x1<float, float>(const Window &window, unsigned int num_el switch(conv_stride_x) { case 1: - convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 2: - convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 3: - convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; default: ARM_COMPUTE_ERROR("Not implemented"); @@ -782,19 +786,19 @@ inline void convolve_1x1<float, float>(const Window &window, unsigned int num_el template <typename T1, typename T2> inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); switch(conv_stride_x) { case 1: - convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_3x3<T1, T2, 
1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 2: - convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 3: - convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; default: ARM_COMPUTE_ERROR("Not implemented"); @@ -803,72 +807,72 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_i template <typename T1, typename T2> inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); switch(conv_stride_x) { case 1: - convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 2: - convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; case 3: - convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); break; default: ARM_COMPUTE_ERROR("Not implemented"); } } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - const DataLayout data_layout = input->data_layout(); + const DataLayout data_layout = src->data_layout(); const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int height_idx = get_data_layout_dimension_index(data_layout, 
DataLayoutDimension::HEIGHT); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != input->dimension(channel_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (input->data_type() == DataType::F16)); + ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16)); // Checks performed when output is configured - if(output->total_size() != 0) + if(dst->total_size() != 0) { - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info); + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - DataType data_type = input->data_type(); + DataType data_type = src->data_type(); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size) { - ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - const DataLayout data_layout = input->data_layout(); + const DataLayout data_layout = src->data_layout(); const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); // Calculate right and bottom border unsigned int kernel_size = weights->dimension(width_idx); const int conv_stride_x = std::get<0>(conv_info.stride()); const int conv_stride_y = std::get<1>(conv_info.stride()); - const int input_width = input->dimension(width_idx); + const int input_width = src->dimension(width_idx); Window win{}; bool window_changed = false; @@ -879,7 +883,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen { case 1: { - switch(input->data_type()) + switch(src->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -887,7 +891,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: - if(run_optim_small_tensor_info(input)) + if(run_optim_small_tensor_info(src)) { num_elems_written_per_iteration = 8; } @@ -905,7 +909,7 @@ std::pair<Status, Window> 
validate_and_configure_window(ITensorInfo *input, ITen break; } case 3: - switch(input->data_type()) + switch(src->data_type()) { case DataType::F32: num_weight_elems_read_per_row = 4 + kernel_size - 1; @@ -926,7 +930,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen break; case 5: { - switch(input->data_type()) + switch(src->data_type()) { case DataType::F32: num_weight_elems_read_per_row = 4 + kernel_size - 1; @@ -948,7 +952,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen // Calculate right pad int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left()); - int end_x = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x; + int end_x = ceil_to_multiple(static_cast<int>(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x; int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width; // Calculate border @@ -963,35 +967,35 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen border_size.bottom = conv_pad_bottom; // Configure window - win = calculate_max_window(*output, Steps(num_elems_written_per_iteration)); + win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration)); - AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, + AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, kernel_size, conv_stride_x, conv_stride_y); AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); - AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration); window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); } else { // Configure window NHWC without any padding - win = calculate_max_window(*output, Steps()); + win = calculate_max_window(*dst, Steps()); } Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -bool have_zero_x_internal_padding(ITensorInfo *input, ITensorInfo *weights) +bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights) { - return (input->padding().left == 0 && weights->padding().left == 0 && input->padding().right == 0 && weights->padding().right == 0); + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); } } // namespace template <typename T> -void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &window) +void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) { // This function assumes that input and weights have not padding in channel @@ -1001,19 +1005,19 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo using tag_type = typename vtype::tag_type; // Scalar quantities - const int element_size = _input->info()->element_size(); - const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = _input->info()->dimension(1); - const int input_dim_h = _input->info()->dimension(2); + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); - const int output_stride_c = _output->info()->strides_in_bytes().x(); + const int output_stride_c = dst->info()->strides_in_bytes().x(); - const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size; - const int kernel_dim_w = _weights->info()->dimension(1); - const int kernel_dim_h = _weights->info()->dimension(2); + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); const int conv_pad_top = _conv_info.pad_top(); const int conv_pad_left = _conv_info.pad_left(); @@ -1025,13 +1029,13 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); // Setup input window for the weights iterator - Window window_w = calculate_max_window(*_weights->info(), Steps()); + Window window_w = calculate_max_window(*weights->info(), Steps()); window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - Iterator out(_output, window_out); - Iterator wei(_weights, window_w); + Iterator out(dst, window_out); + Iterator wei(weights, window_w); constexpr int num_elems_read_per_iteration = 16 / sizeof(T); /* @@ -1079,7 +1083,7 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo * 
As a reminder, the batches of the weights are translated into the * channels of the output */ - const T *in_ptr_row = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes()) + const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h; uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; @@ -1112,7 +1116,7 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &windo } template <typename T> -void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window) +void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) { // Declare useful types using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; @@ -1120,19 +1124,19 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window) using tag_type = typename vtype::tag_type; // Scalar quantities - const int element_size = _input->info()->element_size(); - const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = _input->info()->dimension(1); - const int input_dim_h = _input->info()->dimension(2); + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); - const int output_stride_c = _output->info()->strides_in_bytes().x(); + const int output_stride_c = dst->info()->strides_in_bytes().x(); - const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size; - const int kernel_dim_w = _weights->info()->dimension(1); - const int kernel_dim_h = _weights->info()->dimension(2); + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); const int conv_pad_top = _conv_info.pad_top(); const int conv_pad_left = _conv_info.pad_left(); @@ -1144,13 +1148,13 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window) window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); // Setup input window for the weights iterator - Window window_w = calculate_max_window(*_weights->info(), Steps()); + Window window_w = calculate_max_window(*weights->info(), Steps()); window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - Iterator out(_output, window_out); - Iterator wei(_weights, window_w); + Iterator out(dst, window_out); + Iterator wei(weights, window_w); constexpr int num_elems_read_per_iteration = 16 / 
sizeof(T); @@ -1174,8 +1178,8 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window) const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - const int index_c_end = _weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; execute_window_loop(window_w, [&](const Coordinates & id_w) { @@ -1215,27 +1219,18 @@ void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window) out); } -NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel() - : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0), - _num_elems_written_per_iteration(0), _data_layout() -{ -} - -BorderSize NEDirectConvolutionLayerKernel::border_size() const +BorderSize CpuDirectConvolutionKernel::border_size() const { return _border_size; } -void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - _input = input; - _weights = weights; - _output = output; _conv_info = conv_info; - _data_layout = _input->info()->data_layout(); - _kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); + _data_layout = src->data_layout(); + _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); const unsigned int conv_pad_left = conv_info.pad_left(); const unsigned int conv_pad_top = conv_info.pad_top(); @@ -1251,33 +1246,33 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens } // Get convolved dimensions - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info); + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - DataType data_type = input->info()->data_type(); + DataType data_type = src->data_type(); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, data_type); + auto_init_if_empty(*dst, output_shape, 1, data_type); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info)); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row, + auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + ICpuKernel::configure(win_config.second); } -Status 
NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info) +Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) { unsigned int num_weight_elems_read_per_row = 0; unsigned int num_elems_read_per_iteration = 0; unsigned int num_elems_written_per_iteration = 0; BorderSize border_size = {}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), - output->clone().get(), + dst->clone().get(), conv_info, num_weight_elems_read_per_row, num_elems_read_per_iteration, @@ -1288,14 +1283,16 @@ Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const return Status{}; } -void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info) +void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); if(_data_layout == DataLayout::NCHW) { @@ -1303,14 +1300,14 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo { case 1: { - switch(_input->info()->data_type()) + switch(src->info()->data_type()) { case DataType::F32: - convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -1321,14 +1318,14 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo } case 3: { - switch(_input->info()->data_type()) + switch(src->info()->data_type()) { case DataType::F32: - convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - convolve_3x3<float16_t, float16_t>(window, 
_num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -1339,10 +1336,10 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo } case 5: { - switch(_input->info()->data_type()) + switch(src->info()->data_type()) { case DataType::F32: - convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); break; default: ARM_COMPUTE_ERROR("Data type not supported"); @@ -1359,17 +1356,17 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo } else { - switch(_input->info()->data_type()) + switch(src->info()->data_type()) { case DataType::F32: { - if(have_zero_x_internal_padding(_input->info(), _weights->info())) + if(have_zero_x_internal_padding(src->info(), weights->info())) { - convolve_nhwc_optimized<float>(window); + convolve_nhwc_optimized<float>(window, src, weights, dst); } else { - convolve_nhwc<float>(window); + convolve_nhwc<float>(window, src, weights, dst); } break; } @@ -1379,4 +1376,10 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo } } } +const char *CpuDirectConvolutionKernel::name() const +{ + return "CpuDirectConvolutionLayerKernel"; +} +} // namespace kernels +} // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/src/core/cpu/kernels/CpuDirectConvolutionKernel.h index 259eb683f6..fb8218394b 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/src/core/cpu/kernels/CpuDirectConvolutionKernel.h @@ -21,89 +21,80 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H -#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H +#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H -#include "src/core/NEON/INEKernel.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" namespace arm_compute { class ITensor; - +namespace cpu +{ +namespace kernels +{ /** Interface for the kernel to perform Direct Convolution Layer. 
*/ -class NEDirectConvolutionLayerKernel : public INEKernel +class CpuDirectConvolutionKernel : public ICpuKernel { public: - const char *name() const override - { - return "NEDirectConvolutionLayerKernel"; - } /** Default constructor */ - NEDirectConvolutionLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete; - /** Allow instances of this class to be moved */ - NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default; - /** Allow instances of this class to be moved */ - NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default; - /** Default destructor */ - ~NEDirectConvolutionLayerKernel() = default; + CpuDirectConvolutionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionKernel); /** Set the input, weights, and output tensors. * * @note: DirectConvolution only works in the following configurations: * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 * - * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. - * @param[out] output Output tensor. + * @param[out] dst Output tensor. * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ - void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info); - /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel + void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionKernel * - * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. - * @param[in] output Output tensor. + * @param[in] dst Output tensor. * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
* * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info); + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + BorderSize border_size() const override; private: /* Template function for optimized convolution NHWC */ template <typename T> - void convolve_nhwc_optimized(const Window &window); + void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); /* Template function for convolution NHWC */ template <typename T> - void convolve_nhwc(const Window &window); + void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - const ITensor *_input; - const ITensor *_weights; - ITensor *_output; - PadStrideInfo _conv_info; - BorderSize _border_size; - unsigned int _kernel_size; - unsigned int _num_weight_elems_read_per_row; - unsigned int _num_elems_read_per_iteration; - unsigned int _num_elems_written_per_iteration; - DataLayout _data_layout; + PadStrideInfo _conv_info{}; + BorderSize _border_size{}; + unsigned int _kernel_size{ 0 }; + unsigned int _num_weight_elems_read_per_row{ 0 }; + unsigned int _num_elems_read_per_iteration{ 0 }; + unsigned int _num_elems_written_per_iteration{ 0 }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; }; +} // namespace kernels +} // namespace cpu } // namespace arm_compute -#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H */ +#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h new file mode 100644 index 0000000000..9eeab194cb --- /dev/null +++ b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. + * + * @note We assume bias to be shared + * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part + * of the @ref DirectConvolutionLayerOutputStageKernelInfo. + */ +class CpuDirectConvolutionOutputStageKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuDirectConvolutionOutputStageKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionOutputStageKernel); + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place. + * Data type supported: F16/F32/S32 + * @param[in] bias (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p src + * @param[out] dst (Optional) If the output tensor is specified, the accumulation is done out-of-place. (Defaults to nullptr) + * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. + * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 + * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata + */ + void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionOutputStageKernel + * + * @param[in] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place. + * Data type supported: F16/F32/S32 + * @param[in] bias (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p src + * @param[in] dst (Optional) If the output tensor is specified, the accumulation is done out-of-place. (Defaults to nullptr) + * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. 
+ * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 + * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); + + OutputStageKernel *_func{ nullptr }; + int _result_fixedpoint_multiplier{ 0 }; + int _result_shift{ 0 }; + int _result_offset_after_shift{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H */ diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp index 3597045bd5..d955b0b461 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" +#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -44,42 +44,46 @@ namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); if(bias != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); } - if(input->data_type() == DataType::S32) + if(src->data_type() == DataType::S32) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output == nullptr, "In-place computation not allowed for quantized output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); } // Checks performed when output is configured - if((output 
!= nullptr) && (output->total_size() != 0)) + if((dst != nullptr) && (dst->total_size() != 0)) { - if(is_data_type_float(input->data_type())) + if(is_data_type_float(src->data_type())) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } - else if(input->data_type() == DataType::S32) + else if(src->data_type() == DataType::S32) { // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); @@ -90,25 +94,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con template <typename T> typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type -output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) +output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { + const bool has_bias = bias != nullptr; /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); ARM_COMPUTE_UNUSED(result_shift); ARM_COMPUTE_UNUSED(result_offset_after_shift); const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); - const int window_step_x = 16 / input->info()->element_size(); + const int window_step_x = 16 / src->info()->element_size(); Window win = window; win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator in(input, win); - Iterator out(output, win); + Iterator in(src, win); + Iterator out(dst, win); execute_window_loop(win, [&](const Coordinates & id) { int x = window_start_x; @@ -151,9 +156,10 @@ output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITe template <typename T> typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type -output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) +output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { + const bool has_bias = bias != nullptr; ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); ARM_COMPUTE_UNUSED(result_shift); ARM_COMPUTE_UNUSED(result_offset_after_shift); @@ -166,13 +172,13 @@ output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITe const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); - const int window_step_x 
= 16 / input->info()->element_size(); + const int window_step_x = 16 / src->info()->element_size(); Window win = window; win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator in(input, win); + Iterator in(src, win); Iterator bi(bias, window_bias); - Iterator out(output, win); + Iterator out(dst, win); execute_window_loop(win, [&](const Coordinates &) { @@ -216,11 +222,12 @@ output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITe // Quantized case template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 > -void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) +void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { - using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; - using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); @@ -229,12 +236,12 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); - const int window_step_x = 16 / input->info()->element_size(); + const int window_step_x = 16 / src->info()->element_size(); Window win = window; win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator in(input, win); - Iterator out(output, win); + Iterator in(src, win); + Iterator out(dst, win); execute_window_loop(win, [&](const Coordinates & id) { @@ -295,11 +302,12 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window in, out); } template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 > -void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) +void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { - using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; - using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); @@ -314,13 +322,13 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); - const int window_step_x = 16 / input->info()->element_size(); + 
const int window_step_x = 16 / src->info()->element_size(); Window win = window; win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator in(input, win); + Iterator in(src, win); Iterator bi(bias, window_bias); - Iterator out(output, win); + Iterator out(dst, win); execute_window_loop(win, [&](const Coordinates &) { @@ -377,45 +385,38 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window } } // namespace -NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel() - : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0) -{ -} - -void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output, - const DirectConvolutionLayerOutputStageKernelInfo &info) +void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) { + ARM_COMPUTE_UNUSED(bias); // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(), info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); _func = nullptr; - _bias = bias; - _input = input; - _output = (output != nullptr) ? output : input; _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; _result_shift = info.result_shift; _result_offset_after_shift = info.result_offset_after_shift; // Auto-initialize output if required - if(output != nullptr && output->info() != nullptr) + if(dst != nullptr) { // Work out expected output data type - const DataType output_dt = (input->info()->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; + const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt)); + auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); } - Window win = calculate_max_window(*input->info(), Steps()); + Window win = calculate_max_window(*src, Steps()); - INEKernel::configure(win); + ICpuKernel::configure(win); - const bool is_qasymm8_signed = (output != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; // Set appropriate function - if(input->info()->data_layout() == DataLayout::NCHW) + if(src->data_layout() == DataLayout::NCHW) { - switch(input->info()->data_type()) + switch(src->data_type()) { case DataType::S32: { @@ -449,7 +450,7 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const } else { - switch(input->info()->data_type()) + switch(src->data_type()) { case DataType::S32: { @@ -483,22 +484,31 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const } } -Status NEDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const DirectConvolutionLayerOutputStageKernelInfo &info) +Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); - + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); return Status{}; } -void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const ThreadInfo &info) +void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - const bool has_bias = _bias != nullptr; - (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, has_bias); + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); +} + +const char *CpuDirectConvolutionOutputStageKernel::name() const +{ + return "CpuDirectConvolutionOutputStageKernel"; } +} // namespace kernels +} // namespace cpu } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index a953edc78f..73834381c6 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,107 +27,48 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/runtime/cpu/operators/CpuDirectConvolution.h" namespace arm_compute { -NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default; +struct NEDirectConvolutionLayer::Impl +{ + ITensor *src{ nullptr }; + const ITensor *weights{ nullptr }; + const ITensor *bias{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuDirectConvolution> op{ nullptr }; +}; NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() + : _memory_manager(std::move(memory_manager)), _impl(std::make_unique<Impl>()) { } +NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default; void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN); - _output_stage_kernel = std::make_unique<NEDirectConvolutionLayerOutputStageKernel>(); - _conv_kernel = std::make_unique<NEDirectConvolutionLayerKernel>(); - _input_border_handler = std::make_unique<NEFillBorderKernel>(); - - // Free accumulator - if(_accumulator.buffer() != nullptr) - { - _accumulator.allocator()->free(); - } - - _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY; - - // Check if bias should be added in the convolution result - _has_bias = (bias != nullptr); - - _conv_kernel->configure(input, weights, output, conv_info); - if(_has_bias) - { - _output_stage_kernel->configure(output, bias); - } - _is_padding_required = !_conv_kernel->border_size().empty(); - - if(_is_padding_required) - { - // Add zero padding XY - _input_border_handler->configure(input, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f))); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function.configure(output, nullptr, act_info); - } + _impl->src = input; + _impl->weights = weights; + _impl->bias = bias; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuDirectConvolution>(_memory_manager); + _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? 
bias->info() : nullptr), output->info(), conv_info, act_info); } Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - - // output might not be initialized since it can be an intermediate tensor of another layer - DataType data_type = input->data_type(); - TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); - - // Validate Convolution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info)); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), - "Biases size and number of input feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); - } - - // Validate bias kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output)); - - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); - } - - return Status{}; + return cpu::CpuDirectConvolution::validate(input, weights, bias, output, conv_info, act_info); } void NEDirectConvolutionLayer::run() { - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_is_padding_required) - { - NEScheduler::get().schedule(_input_border_handler.get(), Window::DimZ); - } - NEScheduler::get().schedule(_conv_kernel.get(), _dim_split); - if(_has_bias) - { - NEScheduler::get().schedule(_output_stage_kernel.get(), Window::DimY); - } - - if(_is_activationlayer_enabled) - { - _activationlayer_function.run(); - } + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->bias); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.cpp b/src/runtime/cpu/operators/CpuDirectConvolution.cpp new file mode 100644 index 0000000000..33f79603e8 --- /dev/null +++ b/src/runtime/cpu/operators/CpuDirectConvolution.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuDirectConvolution.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuDirectConvolution::~CpuDirectConvolution() = default; + +CpuDirectConvolution::CpuDirectConvolution(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), + _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() +{ +} + +void CpuDirectConvolution::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + _output_stage_kernel = std::make_unique<kernels::CpuDirectConvolutionOutputStageKernel>(); + _conv_kernel = std::make_unique<kernels::CpuDirectConvolutionKernel>(); + _input_border_handler = std::make_unique<NEFillBorderKernel>(); + + // Free accumulator + if(_accumulator.buffer() != nullptr) + { + _accumulator.allocator()->free(); + } + + _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY; + + // Check if bias should be added in the convolution result + _has_bias = (bias != nullptr); + + _conv_kernel->configure(src, weights, dst, conv_info); + if(_has_bias) + { + _output_stage_kernel->configure(dst, bias); + } + _is_padding_required = !_conv_kernel->border_size().empty(); + + if(_is_padding_required) + { + // Add zero padding XY + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f))); + } + + // Configure Activation Layer + _is_activationlayer_enabled = act_info.enabled(); + if(_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<CpuActivation>(); + _activationlayer_function->configure(dst, dst, act_info); + } +} + +Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + + // output might not be initialized since it can be an intermediate tensor of another layer + DataType data_type = src->data_type(); + TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); + + // Validate Convolution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionKernel::validate(src, weights, &accumulator, conv_info)); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), + "Biases size and number of output feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // Validate bias kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionOutputStageKernel::validate(&accumulator, bias, dst)); + 
+ if(act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); + } + + return Status{}; +} + +void CpuDirectConvolution::run(ITensorPack &tensors) +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if(_is_padding_required) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_DST, src); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); + } + NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); + if(_has_bias) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, dst); + pack.add_tensor(TensorType::ACL_SRC_1, bias); + pack.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); + } + + if(_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.h b/src/runtime/cpu/operators/CpuDirectConvolution.h new file mode 100644 index 0000000000..0635e087fd --- /dev/null +++ b/src/runtime/cpu/operators/CpuDirectConvolution.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H +#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/Tensor.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/cpu/ICpuKernel.h" +#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h" +#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" +#include "src/runtime/cpu/ICpuOperator.h" +#include "src/runtime/cpu/operators/CpuActivation.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Function to run the direct convolution. + * + * This function calls the following kernels: + * + * -# @ref NEFillBorderKernel for the input + * -# @ref kernels::CpuDirectConvolutionKernel + * -# @ref kernels::CpuDirectConvolutionOutputStageKernel + */ +class CpuDirectConvolution : public ICpuOperator +{ +public: + /** Constructor */ + CpuDirectConvolution(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Destructor */ + ~CpuDirectConvolution(); + /** Set the input, weights, biases and output tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in, out] src Input tensor info. Data types supported: F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p src. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. + * @param[out] dst Output tensor info. + * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolution + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in] src Input tensor info. Data types supported: F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p src. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. + * @param[in] dst Output tensor info. + * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. 
Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<kernels::CpuDirectConvolutionOutputStageKernel> _output_stage_kernel; + std::unique_ptr<kernels::CpuDirectConvolutionKernel> _conv_kernel; + std::unique_ptr<NEFillBorderKernel> _input_border_handler; + std::unique_ptr<CpuActivation> _activationlayer_function; + Tensor _accumulator; + bool _has_bias{ false }; + bool _is_activationlayer_enabled{ false }; + unsigned int _dim_split{ 0 }; + bool _is_padding_required{ false }; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H */
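
A note on what this port means for callers: the public NEDirectConvolutionLayer interface is unchanged, so existing user code keeps compiling and running; only the internals now delegate to the stateless cpu::CpuDirectConvolution operator. A minimal sketch of that unchanged function-level usage (the NCHW shapes and the 3x3/stride-1/pad-1 configuration here are illustrative assumptions, not taken from the patch):

    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Illustrative shapes: 8x8 input with 16 IFM, 3x3 kernels, 32 OFM (NCHW)
        Tensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32));

        NEDirectConvolutionLayer conv{};
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src/weights/bias ...

        conv.run(); // now builds an ITensorPack and forwards to cpu::CpuDirectConvolution::run
        return 0;
    }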
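Inside the library the tensors are no longer baked into the operator: configure() sees only ITensorInfo metadata, and the concrete ITensor objects arrive at run time in an ITensorPack keyed by ACL_SRC_0/1/2 and ACL_DST, exactly as NEDirectConvolutionLayer::run does in the diff above. A hedged sketch of driving the internal operator directly (this is a non-public API whose headers may move between releases, and the helper run_direct_conv is invented for illustration):

    #include "arm_compute/core/experimental/Types.h"
    #include "src/runtime/cpu/operators/CpuDirectConvolution.h"

    using namespace arm_compute;

    void run_direct_conv(ITensor *src, ITensor *weights, const ITensor *bias, ITensor *dst,
                         const PadStrideInfo &conv_info)
    {
        // One-off shape/type plumbing, on metadata only
        cpu::CpuDirectConvolution op{};
        op.configure(src->info(), weights->info(),
                     (bias != nullptr) ? bias->info() : nullptr,
                     dst->info(), conv_info);

        // Concrete tensors are bound per call; the same configured operator
        // can be reused with different backing memory on every run
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, src);
        pack.add_tensor(TensorType::ACL_SRC_1, weights);
        pack.add_const_tensor(TensorType::ACL_SRC_2, bias);
        pack.add_tensor(TensorType::ACL_DST, dst);
        op.run(pack);
    }

This decoupling of configuration state from tensor memory is the point of the port: several graph nodes can share one configured operator while each supplies its own tensors.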
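The same pattern holds one level down: ported kernels expose run_op(tensors, window, info) instead of run(window, info), hold no tensor pointers, and are scheduled with an explicit pack, as CpuDirectConvolution::run shows for the bias stage. A sketch of scheduling the output-stage kernel on its own under those assumptions (acc holds the convolution results; all names are illustrative):

    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"

    using namespace arm_compute;

    void add_bias(ITensor *acc, const ITensor *bias, ITensor *dst)
    {
        cpu::kernels::CpuDirectConvolutionOutputStageKernel k{};
        k.configure(acc->info(), bias->info(), dst->info());

        // ACL_SRC_0 = accumulators, ACL_SRC_1 = shared bias, ACL_DST = output,
        // mirroring the pack built in CpuDirectConvolution::run above
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, acc);
        pack.add_const_tensor(TensorType::ACL_SRC_1, bias);
        pack.add_tensor(TensorType::ACL_DST, dst);
        NEScheduler::get().schedule_op(&k, Window::DimY, k.window(), pack);
    }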
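Finally, on what the output stage computes in the quantized path: for S32 accumulators it performs a gemmlowp-style fixed-point requantization, multiplying each bias-adjusted accumulator by result_fixedpoint_multiplier (a Q0.31 saturating doubling high multiply), rounding-shifting right by result_shift, adding result_offset_after_shift, then saturating to QASYMM8/QASYMM8_SIGNED. A scalar reference of that arithmetic, assuming a non-negative result_shift, no overflow in acc + bias, and round-to-nearest behaviour matching the NEON helpers the kernel uses (illustrative, not the library code):

    #include <algorithm>
    #include <cstdint>

    // Saturating rounding doubling multiply-high, one lane of vqrdmulhq_s32
    int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if(a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // only overflow case
        const int64_t prod  = (int64_t)a * (int64_t)b;
        const int64_t nudge = (prod >= 0) ? (1LL << 30) : (1 - (1LL << 30));
        return (int32_t)((prod + nudge) / (1LL << 31));
    }

    // Rounding arithmetic shift right by 'shift' bits (shift >= 0 assumed)
    int32_t rounding_shift_right(int32_t x, int shift)
    {
        if(shift == 0) return x;
        const int32_t mask      = (int32_t)((1LL << shift) - 1);
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> shift) + ((remainder > threshold) ? 1 : 0);
    }

    // S32 accumulator + bias -> QASYMM8; the signed variant clamps to [-128, 127]
    uint8_t requantize_qasymm8(int32_t acc, int32_t bias, int multiplier, int shift, int offset)
    {
        int32_t v = sat_rounding_doubling_high_mul(acc + bias, multiplier);
        v         = rounding_shift_right(v, shift) + offset;
        return (uint8_t)std::min(255, std::max(0, v));
    }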