From f22f67298d1e7bbc349d5179bceb70971b4bc226 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Fri, 3 Jul 2020 16:29:24 +0100 Subject: COMPMID-3532: Align data type support between doxygen and implementation - CPP The patch also removes some unused NEON kernels. Change-Id: I4a7622f31c88ee038b21874614a981764a03122a Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3509 Tested-by: Arm Jenkins Reviewed-by: Sheri Zhang Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- Android.bp | 3 - .../CPPBoxWithNonMaximaSuppressionLimitKernel.h | 6 +- arm_compute/core/CPP/kernels/CPPPermuteKernel.h | 8 +- arm_compute/core/CPP/kernels/CPPTopKVKernel.h | 4 +- arm_compute/core/CPP/kernels/CPPUpsampleKernel.h | 4 +- arm_compute/core/NEON/NEKernels.h | 3 - .../core/NEON/kernels/NEFillInnerBorderKernel.h | 79 ---- .../kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 75 ---- .../kernels/NEGEMMMatrixVectorMultiplyKernel.h | 99 ----- .../CPPBoxWithNonMaximaSuppressionLimit.h | 4 +- arm_compute/runtime/CPP/functions/CPPSplit.h | 4 +- arm_compute/runtime/CPP/functions/CPPTopKV.h | 4 +- arm_compute/runtime/CPP/functions/CPPUpsample.h | 4 +- docs/00_introduction.dox | 6 +- src/core/NEON/kernels/NEFillInnerBorderKernel.cpp | 138 ------- .../kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp | 169 --------- .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp | 410 --------------------- 17 files changed, 21 insertions(+), 999 deletions(-) delete mode 100644 arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h delete mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h delete mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h delete mode 100644 src/core/NEON/kernels/NEFillInnerBorderKernel.cpp delete mode 100644 src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp delete mode 100644 src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp diff --git a/Android.bp b/Android.bp index d19db113d5..b0cacea7b1 100644 --- a/Android.bp +++ b/Android.bp @@ -276,7 +276,6 @@ cc_library_static { "src/core/NEON/kernels/NEFastCornersKernel.cpp", "src/core/NEON/kernels/NEFillArrayKernel.cpp", "src/core/NEON/kernels/NEFillBorderKernel.cpp", - "src/core/NEON/kernels/NEFillInnerBorderKernel.cpp", "src/core/NEON/kernels/NEFlattenLayerKernel.cpp", "src/core/NEON/kernels/NEFloorKernel.cpp", "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp", @@ -289,10 +288,8 @@ cc_library_static { "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp", "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp", "src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp", - "src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp", "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp", "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp", - "src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp", "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp", "src/core/NEON/kernels/NEGatherKernel.cpp", "src/core/NEON/kernels/NEGaussian3x3Kernel.cpp", diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h index 3fa83a6d6d..ac797bdf77 100644 --- a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h +++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. 
+ * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H #define ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H -#include "arm_compute/core/IArray.h" -#include "arm_compute/core/IHOG.h" #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" @@ -62,7 +60,7 @@ public: * @param[out] classes The classes output tensor of size [N]. Data types supported: Same as @p scores_in * @param[out] batch_splits_out (Optional) The batch splits output tensor [batch_size]. Data types supported: Same as @p scores_in * @param[out] keeps (Optional) The keeps output tensor of size [N]. Data types supported: Same as@p scores_in - * @param[out] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in + * @param[out] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32 * @param[in] info (Optional) BoxNMSLimitInfo information. */ void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h index e75152f4ea..6a60849bf3 100644 --- a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h +++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h @@ -56,15 +56,15 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32 - * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] input The input tensor to permute. Data types supported: All. + * @param[out] output The output tensor. Data types supported: same as @p input * @param[in] perm Permutation vector */ void configure(const ITensor *input, ITensor *output, const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32 - * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] input The input tensor to permute. Data types supported: All. + * @param[in] output The output tensor. Data types supported: same as @p input * @param[in] perm Permutation vector * * @return a status diff --git a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h index 4b9bfdd3c9..bba183ce3a 100644 --- a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h +++ b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h @@ -54,7 +54,7 @@ public: /** Set the input and output of the kernel. * * @param[in] predictions A batch_size x classes tensor. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED - * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: S32 + * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: U32 * @param[out] output Computed precision at @p k as a bool 1D tensor. Data types supported: U8 * @param[in] k Number of top elements to look at for computing precision. */ @@ -63,7 +63,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CPPTopKVKernel * * @param[in] predictions A batch_size x classes tensor info. 
Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED - * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: S32 + * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: U32 * @param[in] output Computed precision at @p k as a bool 1D tensor info. Data types supported: U8 * @param[in] k Number of top elements to look at for computing precision. * diff --git a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h index 9fbc9b697c..eb23d1a185 100644 --- a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h +++ b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h @@ -55,8 +55,8 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED - * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] input The input tensor to upsample. Data types supported: All. + * @param[out] output The output tensor. Data types supported: same as @p input. * @param[in] info Padding info. */ void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 1c87b11030..1deb64a8de 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -70,7 +70,6 @@ #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h" #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h" #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h" #include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" #include "arm_compute/core/NEON/kernels/NEFloorKernel.h" #include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" @@ -84,10 +83,8 @@ #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "arm_compute/core/NEON/kernels/NEGatherKernel.h" #include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h deleted file mode 100644 index 9c1059e606..0000000000 --- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H -#define ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the kernel to fill the interior borders */ -class NEFillInnerBorderKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEFillInnerBorderKernel"; - } - /** Default constructor */ - NEFillInnerBorderKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete; - /** Allow instances of this class to be moved */ - NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default; - /** Allow instances of this class to be moved */ - NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default; - /** Default destructor */ - ~NEFillInnerBorderKernel() = default; - - /** Initialise the function. - * - * @note This kernel fills the borders within the XY-planes. - * - * @param[in,out] input Tensor to process. Data types supported: U8/S16/S32/F32. - * @param[in] border_size Size of the border to fill in elements. - * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. - * - */ - void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue()); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - template <typename T> - void fill_value_single_channel(const Window &window); - - ITensor *_tensor; - BorderSize _border_size; - PixelValue _constant_border_value; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h deleted file mode 100644 index a3ba57e4ab..0000000000 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H -#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; -/** NEON kernel to add a bias to each row of the input tensor */ -class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEGEMMMatrixAccumulateBiasesKernel"; - } - /** Default constructor */ - NEGEMMMatrixAccumulateBiasesKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default; - /** Default destructor */ - ~NEGEMMMatrixAccumulateBiasesKernel() = default; - /** Set the accumulate buffer and the biases of the kernel. - * - * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32 - * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input - */ - void configure(ITensor *accum, const ITensor *biases); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel - * - * @param[in] accum The accumulate tensor to convert. Data type supported: F32 - * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. 
Data type supported: Same as @p input - * - * @return a status - */ - static Status validate(const ITensorInfo *accum, const ITensorInfo *biases); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - ITensor *_accum; - const ITensor *_biases; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h deleted file mode 100644 index f5635dd58c..0000000000 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_ -#define ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_ - -#include "arm_compute/core/NEON/INESimpleKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the GEMM matrix vector multiply kernel. **/ -class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel -{ -public: - const char *name() const override - { - return "NEGEMMMatrixVectorMultiplyKernel"; - } - /** Default constructor */ - NEGEMMMatrixVectorMultiplyKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMMatrixVectorMultiplyKernel(const NEGEMMMatrixVectorMultiplyKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMMatrixVectorMultiplyKernel &operator=(const NEGEMMMatrixVectorMultiplyKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMMatrixVectorMultiplyKernel(NEGEMMMatrixVectorMultiplyKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] input1 Second Input tensor. Data types supported: same as @p input. - * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input. 
- */ - void configure(const ITensor *input0, const ITensor *input1, ITensor *output); - - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel - * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] input1 Second Input tensor. Data types supported: same as @p input. - * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - /** Template function to run the matrix vector multiplication - * - * @tparam I0 Input 0 type - * @tparam I1 Input 1 type - * @tparam O Output type - * - * @param[in] window_in Input region. (Must be a valid region of the window returned by window()). - * @param[in] window_w Weights region. (Must be a valid region of the window returned by window()). - * @param[in] window_out Output region.(Must be a valid region of the window returned by window()). - */ - template <typename I0, typename I1, typename O> - void matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out); - /** Common signature for all the specialised matrix vector multiplication functions */ - using GEMMMatrixVectorMultiplyFunctionPtr = void (NEGEMMMatrixVectorMultiplyKernel::*)(const Window &window_in, - const Window &window_w, - const Window &window_out); - -private: - GEMMMatrixVectorMultiplyFunctionPtr _func; - const ITensor *_input0; - const ITensor *_input1; - ITensor *_output; - BorderSize _border_size; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/ diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h index b6f55b5bf8..b1ea84dcda 100644 --- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h +++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h @@ -58,7 +58,7 @@ public: * @param[out] classes The classes output tensor of size [N]. Data types supported: Same as @p scores_in * @param[out] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in * @param[out] keeps (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in - * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in + * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32. * @param[in] info (Optional) BoxNMSLimitInfo information. */ void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, @@ -76,7 +76,7 @@ public: * @param[in] classes The classes output tensor of size [N]. Data types supported: Same as @p scores_in * @param[in] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in * @param[in] keeps (Optional) The keeps output tensor of size [N].
Data types supported: Same as @p scores_in - * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in + * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32. * @param[in] info (Optional) BoxNMSLimitInfo information. * * @return a status diff --git a/arm_compute/runtime/CPP/functions/CPPSplit.h b/arm_compute/runtime/CPP/functions/CPPSplit.h index 6adcbc3323..611cc41157 100644 --- a/arm_compute/runtime/CPP/functions/CPPSplit.h +++ b/arm_compute/runtime/CPP/functions/CPPSplit.h @@ -47,8 +47,8 @@ public: } /** Static function to check if given info will lead to a valid configuration of @ref CPPSplit * - * @param[in] input The input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. - * @param[in] outputs A vector containing the output tensors' info. Data types supported: Same as @p input. + * @param[in] input The input tensor info. Data types supported: All. + * @param[in] outputs A vector containing the output tensors' info. Data types supported: same as @p input. * The output tensors should match the input tensor dimensions for all shape dimensions apart * from the split dimension * @param[in] axis Axis on which to split the input. diff --git a/arm_compute/runtime/CPP/functions/CPPTopKV.h b/arm_compute/runtime/CPP/functions/CPPTopKV.h index c94e277312..fb93bdf764 100644 --- a/arm_compute/runtime/CPP/functions/CPPTopKV.h +++ b/arm_compute/runtime/CPP/functions/CPPTopKV.h @@ -39,7 +39,7 @@ public: /** Set the input and output of the kernel. * * @param[in] predictions A batch_size x classes tensor. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED - * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: S32 + * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: U32 * @param[out] output Computed precision at @p k as a bool 1D tensor. Data types supported: U8 * @param[in] k Number of top elements to look at for computing precision. */ @@ -48,7 +48,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CPPTopKVKernel * * @param[in] predictions A batch_size x classes tensor info. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED - * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: S32 + * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: U32 * @param[in] output Computed precision at @p k as a bool 1D tensor info. Data types supported: U8 * @param[in] k Number of top elements to look at for computing precision. * diff --git a/arm_compute/runtime/CPP/functions/CPPUpsample.h b/arm_compute/runtime/CPP/functions/CPPUpsample.h index 3e1852bc30..080e86b676 100644 --- a/arm_compute/runtime/CPP/functions/CPPUpsample.h +++ b/arm_compute/runtime/CPP/functions/CPPUpsample.h @@ -38,8 +38,8 @@ class CPPUpsample : public ICPPSimpleFunction public: /** Configure the upsample CPP kernel * - * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED - * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] input The input tensor to upsample. Data types supported: All. + * @param[out] output The output tensor. 
Data types supported: same as @p input * @param[in] info Padding information */ void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 3668fafbc8..85146cb15d 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -343,7 +343,7 @@ v20.02 Public major release - @ref NEElementwiseMin - @ref NEElementwiseSquaredDiff - @ref NEFullyConnectedLayer - - @ref NEGEMMMatrixVectorMultiplyKernel + - NEGEMMMatrixVectorMultiplyKernel - @ref NEPixelWiseMultiplication - @ref NEPoolingLayer - @ref NEPReluLayer @@ -915,7 +915,7 @@ v17.12 Public major release - New NEON kernels / functions - arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore - arm_compute::NEHGEMMAArch64FP16Kernel - - @ref NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / @ref NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer + - @ref NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer - @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - NEWinogradLayer / NEWinogradLayerKernel @@ -1037,7 +1037,7 @@ v17.03.1 First Major public release of the sources - @ref NETransposeKernel / @ref NETranspose - @ref NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer - @ref NEIm2ColKernel, @ref NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer - - @ref NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer + - NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer - @ref NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp v17.03 Sources preview diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp deleted file mode 100644 index 50060b2376..0000000000 --- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <algorithm> -#include <cstddef> -#include <cstdint> - -using namespace arm_compute; - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -NEFillInnerBorderKernel::NEFillInnerBorderKernel() - : _tensor(nullptr), _border_size(0), _constant_border_value(static_cast<float>(0.f)) -{ -} - -void NEFillInnerBorderKernel::configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::S32, DataType::F32); - - _tensor = input; - _border_size = border_size; - _constant_border_value = constant_border_value; - - Window win; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(Window::DimY, Window::Dimension(0, 1, 1)); - win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ); - INEKernel::configure(win); -} - -void NEFillInnerBorderKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - // If there is no border: early exit - if(_border_size.empty()) - { - return; - } - - switch(_tensor->info()->data_type()) - { - case DataType::U8: - fill_value_single_channel<uint8_t>(window); - break; - case DataType::S16: - fill_value_single_channel<int16_t>(window); - break; - case DataType::S32: - fill_value_single_channel<int32_t>(window); - break; - case DataType::F32: - static_assert(sizeof(float) == 4, "Float must be 32 bit"); - fill_value_single_channel<float>(window); - break; - default: - ARM_COMPUTE_ERROR("Not handled"); - break; - } -} - -template <typename T> -void NEFillInnerBorderKernel::fill_value_single_channel(const Window &window) -{ - const size_t stride = _tensor->info()->strides_in_bytes()[1]; - const size_t width = _tensor->info()->dimension(0); - const size_t height = _tensor->info()->dimension(1); - - T constant_border_value; - _constant_border_value.get(constant_border_value); - - // Left and right border - // All X values are set at once - Window vertical(window); - vertical.set(Window::DimY, Window::Dimension(0, height, 1)); - - Iterator vertical_it(_tensor, vertical); - - execute_window_loop(vertical, [&](const Coordinates &) - { - std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()), _border_size.left, constant_border_value); - std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()) + width - _border_size.right, _border_size.right, constant_border_value); - }, - vertical_it); - - // Top and bottom border - // All values are set at once - Iterator horizontal_it(_tensor, window); - - execute_window_loop(window, [&](const Coordinates &) - { - for(size_t i = 0; i < _border_size.top; ++i) - { - std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + i * stride), width, constant_border_value); - } - - for(size_t i = 0; i < _border_size.bottom; ++i) - { - std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + (height - i - 1) * stride), width, constant_border_value); - } - }, -
horizontal_it); -} diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp deleted file mode 100644 index 5ac2323896..0000000000 --- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <arm_neon.h> -#include <cstddef> -#include <cstdint> - -using namespace arm_compute; - -namespace -{ -inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0)); - - return Status{}; -} - -inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases) -{ - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Configure kernel window - Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); - - bool window_changed = update_window_and_padding(win, - AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration), - AccessWindowStatic(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->tensor_shape().y())); - - AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration); - - // Set the valid region for the accum tensor - Coordinates coord; - coord.set_num_dimensions(accum->num_dimensions()); - output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape())); - - Status err = (window_changed) ?
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel() - : _accum(nullptr), _biases(nullptr) -{ -} - -void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); - - _biases = biases; - _accum = accum; - - // Configure kernel window - auto win_config = validate_and_configure_window(accum->info(), biases->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); -} - -Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get()).first); - - return Status{}; -} - -void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step())); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator in0_out(_accum, window); - Iterator in1(_biases, win_biases); - - switch(_accum->info()->data_type()) - { - case DataType::F32: - { - execute_window_loop(window, [&](const Coordinates &) - { - const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr())); - const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr())); - const float32x4x4_t res = - { - { - vaddq_f32(accum.val[0], biases.val[0]), - vaddq_f32(accum.val[1], biases.val[1]), - vaddq_f32(accum.val[2], biases.val[2]), - vaddq_f32(accum.val[3], biases.val[3]) - } - }; - - vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res); - }, - in0_out, in1); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - execute_window_loop(window, [&](const Coordinates &) - { - const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr())); - const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr())); - const float16x8x2_t res = - { - { - vaddq_f16(accum.val[0], biases.val[0]), - vaddq_f16(accum.val[1], biases.val[1]) - } - }; - - vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res); - }, - in0_out, in1); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } -} diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp deleted file mode 100644 index cf8411c55f..0000000000 --- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <arm_neon.h> -#include <cstddef> -#include <cstdint> -#include <tuple> - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - if(is_data_type_quantized_asymmetric(input0->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); - } - - ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(DataLayoutDimension::HEIGHT) != output->dimension(DataLayoutDimension::HEIGHT)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(DataLayoutDimension::WIDTH) != output->dimension(DataLayoutDimension::WIDTH)); - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) -{ - const unsigned int num_elems_read_per_iteration = 16 / input0->element_size(); - - Window win = calculate_max_window(*input0, Steps(num_elems_read_per_iteration)); - - AccessWindowHorizontal input0_access(input0, 0, num_elems_read_per_iteration); - AccessWindowHorizontal input1_access(input1, 0, num_elems_read_per_iteration); - AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1)); - - bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access); - - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ?
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -template <typename I0, typename I1, typename O> -void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out) -{ - ARM_COMPUTE_ERROR("Unsupported data types"); - ARM_COMPUTE_UNUSED(window_in); - ARM_COMPUTE_UNUSED(window_w); - ARM_COMPUTE_UNUSED(window_out); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>(const Window &window_in, - const Window &window_w, - const Window &window_out) -{ - Iterator in(_input0, window_in); - Iterator in2(_input1, window_w); - Iterator out(_output, window_out); - - const int input_w = _input0->info()->dimension(0); - const int input_h = _input0->info()->dimension(1); - const int input_stride_x = _input0->info()->strides_in_bytes().x(); - const int weights_stride_x = _input1->info()->strides_in_bytes().x(); - const int weights_stride_y = _input1->info()->strides_in_bytes().y(); - const int output_stride_x = _output->info()->strides_in_bytes().x(); - - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; - auto output_ptr = reinterpret_cast<__fp16 *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); - - float16x8_t row_dot = vdupq_n_f16(0.f); - for(int i = 0; i < input_w; i += 8) - { - const auto input = vld1q_f16(reinterpret_cast<const float16_t *>(input_ptr + i * input_stride_x)); - const auto weights = vld1q_f16(reinterpret_cast<const float16_t *>(weights_ptr + i * weights_stride_x)); - row_dot = vaddq_f16(row_dot, vmulq_f16(input, weights)); - } - - auto temp = vadd_f16(vget_high_f16(row_dot), vget_low_f16(row_dot)); - temp = vpadd_f16(temp, temp); - temp = vpadd_f16(temp, temp); - - *output_ptr = vget_lane_f16(temp, 0); - }, - in, in2, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template <> -void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>(const Window &window_in, - const Window &window_w, - const Window &window_out) -{ - Iterator in(_input0, window_in); - Iterator in2(_input1, window_w); - Iterator out(_output, window_out); - - const int input_w = _input0->info()->dimension(0); - const int input_h = _input0->info()->dimension(1); - const int input_stride_x = _input0->info()->strides_in_bytes().x(); - const int weights_stride_x = _input1->info()->strides_in_bytes().x(); - const int weights_stride_y = _input1->info()->strides_in_bytes().y(); - const int output_stride_x = _output->info()->strides_in_bytes().x(); - - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; - auto output_ptr = reinterpret_cast<float *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); - - float32x4_t row_dot = vdupq_n_f32(0.f); - for(int i = 0; i < input_w; i += 4) - { - const auto input = vld1q_f32(reinterpret_cast<const float *>(input_ptr + i * input_stride_x)); - const auto weights = vld1q_f32(reinterpret_cast<const float *>(weights_ptr + i * weights_stride_x)); - row_dot = vaddq_f32(row_dot, vmulq_f32(input, weights)); - } - - auto temp = vadd_f32(vget_high_f32(row_dot), vget_low_f32(row_dot)); - temp = vpadd_f32(temp, temp); - - *output_ptr = vget_lane_f32(temp, 0); - }, - in, in2, out); -} - -template <> -void
NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>(const Window &window_in, - const Window &window_w, - const Window &window_out) -{ - Iterator in(_input0, window_in); - Iterator in2(_input1, window_w); - Iterator out(_output, window_out); - - const int input_offset = -_input0->info()->quantization_info().uniform().offset; - const int weights_offset = -_input1->info()->quantization_info().uniform().offset; - - const int input_w = _input0->info()->dimension(0); - const int input_h = _input0->info()->dimension(1); - const int input_stride_x = _input0->info()->strides_in_bytes().x(); - const int weights_stride_x = _input1->info()->strides_in_bytes().x(); - const int weights_stride_y = _input1->info()->strides_in_bytes().y(); - const int output_stride_x = _output->info()->strides_in_bytes().x(); - const int read_step = 16 / _input0->info()->element_size(); - - const int32x4_t v_input_offset = vdupq_n_s32(input_offset); - const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); - - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; - auto output_ptr = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); - - int32x4_t row_dot = vdupq_n_s32(0); - for(int i = 0; i < input_w; i += read_step) - { - // Read values - const auto input = vld1q_u8(reinterpret_cast<const uint8_t *>(input_ptr + i * input_stride_x)); - const auto weights = vld1q_u8(reinterpret_cast<const uint8_t *>(weights_ptr + i * weights_stride_x)); - - // Add offsets - const int32x4x4_t input_s32 = - { - { - vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(input))))), - vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(input))))), - vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(input))))), - vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(input))))) - } - }; - const int32x4x4_t weights_s32 = - { - { - vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(weights))))), - vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(weights))))), - vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(weights))))), - vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(weights))))) - } - }; - - // Dot - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0])); - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1])); - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2])); - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3])); - } - - // Reduction - auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot)); - temp = vpadd_s32(temp, temp); - - *output_ptr = vget_lane_s32(temp, 0); - }, - in, in2, out); -} - -template <> -void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>(const Window &window_in, - const Window &window_w, - const Window &window_out) -{ - Iterator in(_input0, window_in); - Iterator in2(_input1, window_w); - Iterator out(_output, window_out); - - const int input_offset = -_input0->info()->quantization_info().uniform().offset; - const int weights_offset = -_input1->info()->quantization_info().uniform().offset; - - const int input_w = _input0->info()->dimension(0); - const int input_h = _input0->info()->dimension(1); -
const int input_stride_x = _input0->info()->strides_in_bytes().x(); - const int weights_stride_x = _input1->info()->strides_in_bytes().x(); - const int weights_stride_y = _input1->info()->strides_in_bytes().y(); - const int output_stride_x = _output->info()->strides_in_bytes().x(); - const int read_step = 16 / _input0->info()->element_size(); - - const int32x4_t v_input_offset = vdupq_n_s32(input_offset); - const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); - - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; - auto output_ptr = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); - - int32x4_t row_dot = vdupq_n_s32(0); - for(int i = 0; i < input_w; i += read_step) - { - // Read values - const auto input = vld1q_s8(reinterpret_cast<const int8_t *>(input_ptr + i * input_stride_x)); - const auto weights = vld1q_s8(reinterpret_cast<const int8_t *>(weights_ptr + i * weights_stride_x)); - - // Add offsets - const int32x4x4_t input_s32 = - { - { - vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_low_s8(input)))), - vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_low_s8(input)))), - vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_high_s8(input)))), - vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_high_s8(input)))) - } - }; - const int32x4x4_t weights_s32 = - { - { - vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_low_s8(weights)))), - vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_low_s8(weights)))), - vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_high_s8(weights)))), - vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_high_s8(weights)))) - } - }; - - // Dot - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0])); - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1])); - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2])); - row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3])); - } - - // Reduction - auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot)); - temp = vpadd_s32(temp, temp); - - *output_ptr = vget_lane_s32(temp, 0); - }, - in, in2, out); -} - -NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel() - : _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0) -{ -} - -BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const -{ - return _border_size; -} - -void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); - - _input0 = input0; - _input1 = input1; - _output = output; - - // Set appropriate function to run - switch(input0->info()->data_type()) - { - case DataType::QASYMM8: - _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>; - break; - case DataType::QASYMM8_SIGNED: - _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type"); -
} - - // Configure kernel window - const unsigned int num_elems_read_per_iteration = 16 / _input0->info()->element_size(); - - const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0); - _border_size = BorderSize(0, border_x); - - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); -} - -Status NEGEMMMatrixVectorMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); - return Status{}; -} - -void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - Window window_slice = window.first_slice_window_3D(); - - Window window_in(window); - Window window_weights(window_slice); - Window window_out(window); - - // Setup input0 slice - window_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0))); - window_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1), 1)); - window_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1)); - - // Setup input1 and output slice. Their dimensions are increased in the kernel. - window_weights.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_weights.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_weights.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - (this->*_func)(window_in, window_weights, window_out); -} -} // namespace arm_compute -- cgit v1.2.1