author     Manuel Bottini <manuel.bottini@arm.com>    2021-06-18 15:47:28 +0100
committer  Manuel Bottini <manuel.bottini@arm.com>    2021-07-08 14:47:38 +0000
commit     cfac51c779f9bf05e8b2d386fbfb4022767d1d30 (patch)
tree       6ded148068c32bb1b2926946f59d0262d928b9ab /src/core
parent     06ac6e438fc95aa7f8228be8217e0776d692b8e7 (diff)
download   ComputeLibrary-cfac51c779f9bf05e8b2d386fbfb4022767d1d30.tar.gz
Port NEGEMMLowp Part 2
Details:
Extend NEConvertQuantizedSignednessKernel
Port NEGEMMInterleave4x4Kernel to CpuGemmInterleave4x4Kernel
Port NEGEMMTranspose1xWKernel to CpuGemmTranspose1xWKernel
Port NEGEMMLowpMatrixAReductionKernel to CpuGemmLowpMatrixAReductionKernel
Port NEGEMMLowpMatrixBReductionKernel to CpuGemmLowpMatrixBReductionKernel
Port NEGEMMLowpOffsetContributionOutputStageKernel to CpuGemmLowpOffsetContributionOutputStageKernel
Port NEGEMMLowpOffsetContributionKernel to CpuGemmLowpOffsetContributionKernel
Resolves: COMPMID-4403
Change-Id: I3227f052f25e7b41d073bbea1da8a881fcd78b8e
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5875
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'src/core')
20 files changed, 722 insertions, 822 deletions
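[Editor's note] The common thread across the ports listed above is the interface change from stateful NEON kernels, which cached ITensor pointers in configure(), to stateless cpu::kernels classes: configure() and validate() now take only ITensorInfo descriptors, and the tensors themselves are passed per run through an ITensorPack consumed by run_op(). Below is a minimal sketch of the new calling convention using the ported signedness-conversion kernel from this patch; the shapes, quantization values, and the wrapper function name are illustrative, not from the patch, and since src/ headers are internal this pattern normally lives inside the library's operator layer rather than user code:

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"

    using namespace arm_compute;

    void convert_signedness_sketch() // hypothetical driver, for illustration only
    {
        // Configuration works on metadata only; the kernel keeps no tensor pointers.
        TensorInfo src_info(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
        TensorInfo dst_info{}; // auto-initialized to QASYMM8_SIGNED by configure()

        cpu::kernels::CpuConvertQuantizedSignednessKernel k;
        k.configure(&src_info, &dst_info);

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        // Tensors are bound at run time through an ITensorPack, as run_op() expects.
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        NEScheduler::get().schedule_op(&k, Window::DimY, k.window(), pack);
    }

This mirrors how the operator-level code drives these kernels, and it is why the _input/_output member pointers disappear from the kernel classes in the diff below.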
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index 69c8d7bebc..cd09544d31 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -33,7 +33,6 @@ #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" #include "src/core/NEON/kernels/NECol2ImKernel.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" #include "src/core/NEON/kernels/NECropKernel.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" @@ -41,10 +40,6 @@ #include "src/core/NEON/kernels/NEFFTScaleKernel.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "src/core/NEON/kernels/NEGatherKernel.h" #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h deleted file mode 100644 index 67d5ca246e..0000000000 --- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H -#define ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */ -class NEConvertQuantizedSignednessKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEConvertQuantizedSignednessKernel"; - } - /** Default constructor */ - NEConvertQuantizedSignednessKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ - NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete; - /** Allow instances of this class to be moved */ - NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default; - /** Allow instances of this class to be moved */ - NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default; - /** Default destructor */ - ~NEConvertQuantizedSignednessKernel() = default; - /** Initialize the kernel's input, output. - * - * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data types supported: opposite of @p input. - */ - void configure(const ITensor *input, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel - * - * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor. Data types supported: opposite of @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input; - ITensor *_output; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H */ diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h deleted file mode 100644 index b9a1b5e840..0000000000 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H -#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H - -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Kernel to multiply matrices - * - * @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel - * This kernel performs the following computation: - * - * -# Convert a values from int8 to int32 - * -# Convert b values from int8 to int32 - * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 - * - */ -class NEGEMMLowpMatrixMultiplyKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpMatrixMultiplyKernel"; - } - /** Constructor */ - NEGEMMLowpMatrixMultiplyKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default; - /** Default destructor */ - ~NEGEMMLowpMatrixMultiplyKernel() = default; - /** Initialise the kernel's input and output. - * - * The input matrices @p input0 and @p input1 must be the output of the kernels: cpu::kernels::CpuGemmInterleave4x4Kernel and @ref cpu::kernels::CpuGemmTranspose1xWKernel. These two - * kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - */ - void configure(const ITensor *input0, const ITensor *input1, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel - * - * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED - * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[in] output Output tensor info to store the result of matrix multiplication. 
Data type supported: S32 - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input0; - const ITensor *_input1; - ITensor *_output; - bool _slide_matrix_b; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H*/ diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 6908f37aad..0000000000 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H -#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H - -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel. - * - * The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8. - * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8. 
- * - * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is: - * - * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift - * - * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is: - * - * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * and mm_result'[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - */ - -class NEGEMMLowpOffsetContributionOutputStageKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpOffsetContributionOutputStageKernel"; - } - /** Constructor */ - NEGEMMLowpOffsetContributionOutputStageKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpOffsetContributionOutputStageKernel(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpOffsetContributionOutputStageKernel &operator=(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMLowpOffsetContributionOutputStageKernel(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMLowpOffsetContributionOutputStageKernel &operator=(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Default destructor */ - ~NEGEMMLowpOffsetContributionOutputStageKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[out] output Output tensor containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. - */ - void configure(const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, int32_t a_offset, int32_t b_offset, - GEMMLowpOutputStageInfo output_stage); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionOutputStageKernel - * - * @param[in] mm_result Input tensor info containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. 
Data type supported: S32 - * @param[in] vector_sum_col Tensor info for the input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Tensor info for the input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[in] output Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset, - int32_t b_offset, - GEMMLowpOutputStageInfo output_stage); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - /** Function to use for the particular tensors passed to configure() */ - const ITensor *_vector_sum_col; - const ITensor *_vector_sum_row; - const ITensor *_bias; - const ITensor *_mm_result; - ITensor *_output; - int32_t _a_offset; - int32_t _b_offset; - int32_t _k_offset; - bool _slide_vector_sum_col; - GEMMLowpOutputStageInfo _output_stage; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */ diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h deleted file mode 100644 index 9be618d656..0000000000 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H -#define ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H - -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; -struct GEMMLowpReductionKernelInfo; - -/** Common interface for all reduction kernels */ -class INEGEMMLowpReductionKernel : public INEKernel -{ -public: - /** Constructor */ - INEGEMMLowpReductionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete; - /** Allow instances of this class to be moved */ - INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default; - /** Allow instances of this class to be moved */ - INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default; - /** Default destructor */ - virtual ~INEGEMMLowpReductionKernel() = default; - - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0; - -protected: - const ITensor *_input; - ITensor *_output; - int32_t _k; - int32_t _scalar; - bool _mul_by_scalar; -}; - -/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpMatrixAReductionKernel"; - } - /** Default constructor */ - NEGEMMLowpMatrixAReductionKernel() = default; - /** Prevent instances of this class from being copied */ - NEGEMMLowpMatrixAReductionKernel(const NEGEMMLowpMatrixAReductionKernel &) = delete; - /** Prevent instances of this class from being copied */ - NEGEMMLowpMatrixAReductionKernel &operator=(const NEGEMMLowpMatrixAReductionKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMLowpMatrixAReductionKernel(NEGEMMLowpMatrixAReductionKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMLowpMatrixAReductionKernel &operator=(NEGEMMLowpMatrixAReductionKernel &&) = default; - /** Default destructor */ - ~NEGEMMLowpMatrixAReductionKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. 
Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_a_cols) Number of matrix A columns - * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced column must be multiplied by a scalar value. - */ - void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_a_cols) Number of matrix A columns - * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced column must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - /** Execution of the reduction kernel specialized on the input type - * - * @param[in] window Execution window - */ - template <typename T> - void run_internal(const Window &window); -}; - -/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpMatrixBReductionKernel"; - } - /** Default constructor */ - NEGEMMLowpMatrixBReductionKernel() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpMatrixBReductionKernel(const NEGEMMLowpMatrixBReductionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpMatrixBReductionKernel &operator=(const NEGEMMLowpMatrixBReductionKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMLowpMatrixBReductionKernel(NEGEMMLowpMatrixBReductionKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMLowpMatrixBReductionKernel &operator=(NEGEMMLowpMatrixBReductionKernel &&) = default; - /** Default destructor */ - ~NEGEMMLowpMatrixBReductionKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_b_rows) Number of matrix B rows. - * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced row must be multiplied by a scalar value. 
- */ - void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_b_rows) Number of matrix B rows. - * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced row must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - /** Execution of the reduction kernel specialized on the input type - * - * @param[in] window Execution window - * @param[in] info Thread-related information - */ - template <typename T> - void run_internal(const Window &window, const ThreadInfo &info); -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H */ diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp index 1f2170f42a..26cbb48deb 100644 --- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp +++ b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -35,76 +35,73 @@ namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); // Validate output if initialized - if(output->total_size() != 0) + if(dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) { // Output auto initialization if not yet initialized { - const bool is_input_signed = input->data_type() == DataType::QASYMM8_SIGNED; + const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo qinfo = input->quantization_info().uniform(); + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); const int offset_correction = is_input_signed ? 
-128 : 128; const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); - auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); + auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); } - return std::make_pair(Status{}, calculate_max_window(*output)); + return std::make_pair(Status{}, calculate_max_window(*dst)); } } // namespace -NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel() - : _input(nullptr), _output(nullptr) -{ -} - -void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output) +void CpuConvertQuantizedSignednessKernel::configure(const ITensorInfo *src, ITensorInfo *dst) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - _input = input; - _output = output; - - std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info()); + std::pair<Status, Window> win_config = validate_and_configure_window(src, dst); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + ICpuKernel::configure(win_config.second); } -Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output) +Status CpuConvertQuantizedSignednessKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); return Status{}; } -void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info) +void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(_input, win_collapsed); - Iterator output(_output, win_collapsed); + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); const int window_step_x = 16; const auto window_start_x = static_cast<int>(window.x().start()); @@ -135,4 +132,11 @@ void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadI }, input, output); } + +const char *CpuConvertQuantizedSignednessKernel::name() const +{ + return "CpuConvertQuantizedSignednessKernel"; +} +} // namespace kernels +} // namespace cpu } // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h new file mode 100644 index 0000000000..2a8f6c364d --- /dev/null +++ b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H +#define ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to convert asymmetric unsigned to asymmetric signed and vice-versa */ +class CpuConvertQuantizedSignednessKernel : public ICpuKernel +{ +public: + CpuConvertQuantizedSignednessKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertQuantizedSignednessKernel); + /** Initialize the kernel input and output info. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data types supported: opposite of @p src. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuConvertQuantizedSignednessKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h index 8f1a54314a..0c55886d8d 100644 --- a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h +++ b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h @@ -56,7 +56,6 @@ class CpuGemmInterleave4x4Kernel : public ICpuKernel { public: CpuGemmInterleave4x4Kernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmInterleave4x4Kernel); /** Initialise the kernel's src and dst. * * @param[in] src Input tensor info. Data types supported: All diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp index 6bcf59ee96..35e542faa4 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -36,10 +36,12 @@ #include <arm_neon.h> -using namespace arm_compute; - namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) @@ -860,19 +862,16 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int }, ina, inb, out); } -} // namespace -namespace -{ -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - TensorShape in0_shape = input0->tensor_shape(); - TensorShape in1_shape = input1->tensor_shape(); - TensorShape out_shape = output->tensor_shape(); + TensorShape in0_shape = src0->tensor_shape(); + TensorShape in1_shape = src1->tensor_shape(); + TensorShape out_shape = dst->tensor_shape(); // Check vector-by-matrix case if(out_shape[1] == 1) @@ -894,63 +893,58 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, } } // namespace -NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) +void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { -} + ARM_COMPUTE_UNUSED(src0); + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst)); -void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); - - TensorShape in1_shape = input1->info()->tensor_shape(); + TensorShape in1_shape = src1->tensor_shape(); in1_shape.collapse(2); - _input0 = input0; - _input1 = input1; - _output = output; _slide_matrix_b = in1_shape[2] != 1; constexpr unsigned int num_elems_processed_per_iteration_x = 16; constexpr unsigned int num_elems_processed_per_iteration_y = 4; Window win; - // Check if the output tensor is a vector. 
If so, the kernel runs the vector-matrix multiplication - if((output->info()->dimension(1) == 1)) + if((dst->dimension(1) == 1)) { // Configure kernel window - win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); } else { - win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); } - INEKernel::configure(win); + ICpuKernel::configure(win); } -Status NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); - + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); return Status{}; } -void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info) +void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path - if((_output->info()->dimension(1) == 1)) + if((dst->info()->dimension(1) == 1)) { - const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0)); - const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0)); - const auto width_out = static_cast<int>(_output->info()->dimension(0)); - const auto in_b_stride = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type())); + const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0)); + const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0)); + const auto width_out = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); // The implementation computes 16 elements per iteration const int window_start_x = 16 * info.thread_id; @@ -969,18 +963,18 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(_input1->info()->num_dimensions() >= 3) + if(src1->info()->num_dimensions() >= 3) { win_b = window; } win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - Iterator ina(_input0, win_a); - Iterator inb(_input1, win_b); - Iterator out(_output, win_out); + Iterator ina(src0, win_a); + Iterator inb(src1, win_b); + Iterator out(dst, win_out); - switch(_input0->info()->data_type()) + switch(src0->info()->data_type()) { case DataType::S8: case 
DataType::QASYMM8_SIGNED: @@ -1003,8 +997,8 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo } else { - const size_t in_b_stride = _input1->info()->strides_in_bytes()[1]; - const int width_b = _input1->info()->dimension(0); + const size_t in_b_stride = src1->info()->strides_in_bytes()[1]; + const int width_b = src1->info()->dimension(0); // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the input interleaved matrix A has 4 times fewer rows than the output matrix Window win_a(window); @@ -1023,22 +1017,22 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); // The step x and step y for the output matrix have already been set in configure() - Iterator ina(_input0, win_a); - Iterator inb(_input1, win_b); - Iterator out(_output, window); + Iterator ina(src0, win_a); + Iterator inb(src1, win_b); + Iterator out(dst, window); - switch(_input0->info()->data_type()) + switch(src0->info()->data_type()) { case DataType::S8: case DataType::QASYMM8_SIGNED: { - matrix_multiply_s8(ina, inb, out, width_b, *_output->info(), window); + matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window); break; } case DataType::U8: case DataType::QASYMM8: { - matrix_multiply_u8(ina, inb, out, width_b, *_output->info(), window); + matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window); break; } default: @@ -1049,4 +1043,11 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo } } } -} // namespace arm_compute + +const char *CpuGemmLowpMatrixMultiplyKernel::name() const +{ + return "CpuGemmLowpMatrixMultiplyKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h new file mode 100644 index 0000000000..77d8741b19 --- /dev/null +++ b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to multiply matrices + * + * @note @ref CpuGemmLowpMatrixMultiplyKernel low precision matrix product kernel + * This kernel performs the following computation: + * + * -# Convert a values from int8 to int32 + * -# Convert b values from int8 to int32 + * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 + * + */ +class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixMultiplyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyKernel); + /** Initialise the kernel's input and output. + * + * The input matrices @p src0 and @p src1 must be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel. These two + * kernels change the layout of the original matrices to be more cache-friendly. + * + * @param[in] src0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED + * @param[in] src1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output tensor info to store the result of matrix multiplication. 
Data type supported: S32 + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixMultiplyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + bool _slide_matrix_b{ true }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H*/ diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp index dfbfbd6fab..270abc8bbd 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" @@ -32,68 +32,80 @@ namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { -Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(output->total_size() > 0) + if(dst->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } -Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(output->total_size() > 0) + 
if(dst->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); } return Status{}; } } // namespace -INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel() - : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false) -{ -} - -void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) { // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); - ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); - _input = mtx_a; - _output = vector_sum_row; + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info)); _k = info.k; _scalar = info.scalar; _mul_by_scalar = info.mul_by_scalar; - // Output auto initialization if not yet initialized - auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32); + switch(src->data_type()) + { + case DataType::QASYMM8: + _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } - Window win = calculate_max_window(*_output->info(), Steps(1)); + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32); - INEKernel::configure(win); + Window win = calculate_max_window(*dst, Steps(1)); + ICpuKernel::configure(win); } -Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info)); return Status{}; } template <typename T> -void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window) +void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window) { // Intermediate and final accumulator types using TIAcc = wrapper::traits::promote_t<T>; @@ -106,15 +118,15 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(_input, win_input); - Iterator out(_output, collapsed_window); + Iterator in(src, win_input); + Iterator out(dst, collapsed_window); execute_window_loop(collapsed_window, [&](const 
Coordinates & id) { auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}); TAcc sum_row = 0; - const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); + const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -160,36 +172,28 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w in, out); } -void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) +void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - switch(_input->info()->data_type()) - { - case DataType::QASYMM8: - run_internal<uint8_t>(window); - break; - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8: - case DataType::QSYMM8_PER_CHANNEL: - run_internal<int8_t>(window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type"); - } + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, dst, window); } -void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +const char *CpuGemmLowpMatrixAReductionKernel::name() const { - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); - ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + return "CpuGemmLowpMatrixAReductionKernel"; +} - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); +void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info)); - _input = mtx_b; - _output = vector_sum_col; _k = info.k; _scalar = info.scalar; _mul_by_scalar = info.mul_by_scalar; @@ -197,24 +201,36 @@ void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor * // Configure kernel window constexpr unsigned int num_elems_processed_per_iteration = 16; + switch(src->data_type()) + { + case DataType::QASYMM8: + _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + // Output auto initialization if not yet initialized - auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32); + auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32); // Configure kernel window - Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration)); - INEKernel::configure(win); + Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration)); + ICpuKernel::configure(win); } -Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const 
ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info)); return Status{}; } template <typename T> -void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info) +void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info) { // Intermediate and final accumulator types using TIAcc = wrapper::traits::promote_t<T>; @@ -223,8 +239,8 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{}); - const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0)); - const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]); + const auto width_matrix_b = static_cast<int>(src->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(src->info()->strides_in_bytes()[1]); // The implementation computes 16 elements per iteration const int window_start_x = 16 * info.thread_id; @@ -239,8 +255,8 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator inb(_input, win_in); - Iterator out(_output, win_out); + Iterator inb(src, win_in); + Iterator out(dst, win_out); execute_window_loop(win_out, [&](const Coordinates & id) { @@ -258,7 +274,7 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}) }; - const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); + const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -359,24 +375,22 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const inb, out); } -void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) +void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - switch(_input->info()->data_type()) - { - case DataType::QASYMM8: - run_internal<uint8_t>(window, info); - break; - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8: - case DataType::QSYMM8_PER_CHANNEL: - run_internal<int8_t>(window, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type"); - } + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, dst, window, info); +} + +const char *CpuGemmLowpMatrixBReductionKernel::name() const +{ + return "CpuGemmLowpMatrixBReductionKernel"; } -} // namespace arm_compute +} // namespace kernels +} // 
namespace cpu +} // namespace arm_compute
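Note: the port above preserves the NEON arithmetic of the reduction kernels; what changes is the run-time contract, since run_op() now fetches src/dst from an ITensorPack (ACL_SRC/ACL_DST) and dispatches through a member function pointer selected in configure(), instead of caching tensor pointers as kernel state. As a reference for what the two reductions actually compute, here is a minimal scalar model in plain C++; all names are illustrative and it deliberately ignores the vectorized inner loops, window partitioning and prefetch hints of the real kernels.

#include <cstdint>
#include <vector>

// Model of CpuGemmLowpMatrixAReductionKernel: per-row sums of an M x K
// row-major int8 matrix, accumulated in int32 (matching the S32 destination
// the kernel auto-initializes), optionally multiplied by a scalar.
std::vector<int32_t> reduce_rows_a(const std::vector<int8_t> &a, int m, int k, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> vector_sum_row(m, 0);
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < k; ++j)
        {
            vector_sum_row[i] += a[i * k + j]; // int8 promoted to int32 while summing
        }
        if(mul_by_scalar)
        {
            vector_sum_row[i] *= scalar;
        }
    }
    return vector_sum_row;
}

// Model of CpuGemmLowpMatrixBReductionKernel: per-column sums of a K x N
// row-major int8 matrix, same accumulator type and optional scaling.
std::vector<int32_t> reduce_cols_b(const std::vector<int8_t> &b, int k, int n, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> vector_sum_col(n, 0);
    for(int i = 0; i < k; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            vector_sum_col[j] += b[i * n + j];
        }
    }
    if(mul_by_scalar)
    {
        for(auto &sum : vector_sum_col)
        {
            sum *= scalar;
        }
    }
    return vector_sum_col;
}

These two vectors are exactly the vector_sum_row / vector_sum_col inputs consumed by the offset contribution kernels ported below.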
\ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h new file mode 100644 index 0000000000..106980fc0b --- /dev/null +++ b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declarations +struct GEMMLowpReductionKernelInfo; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixAReductionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixAReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_a_cols) Number of matrix A columns + * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 + * - scalar Scalar value to multiply each reduced row by. + * - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
+ */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixAReductionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] src Input tensor + * @param[in] dst Output tensor + * @param[in] window Execution window + */ + template <typename T> + void run_internal(const ITensor *src, ITensor *dst, const Window &window); + + /** Common signature for all reduction functions + * + * @param[in] src Input tensor + * @param[out] dst Output tensor + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + + CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr }; + int32_t _k{ 0 }; + int32_t _scalar{ 0 }; + bool _mul_by_scalar{ false }; +}; + +/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixBReductionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixBReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_b_rows) Number of matrix B rows. + * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. + * - scalar Scalar value to multiply each reduced column by. + * - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
+ */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixBReductionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] src Input tensor + * @param[in] dst Output tensor + * @param[in] window Execution window + * @param[in] info Thread-related information + */ + template <typename T> + void run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + + /** Common signature for all reduction functions + * + * @param[in] src Input tensor + * @param[out] dst Output tensor + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + + CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr }; + int32_t _k{ 0 }; + int32_t _scalar{ 0 }; + bool _mul_by_scalar{ false }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H */ diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp index 867beca0ac..9b1bf08955 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -38,6 +38,10 @@ namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, @@ -354,26 +358,16 @@ void run_offset_contribution(const Window &window, } } // namespace -NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true) -{ -} - -void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) { // Perform validate step + ARM_COMPUTE_UNUSED(vector_sum_row); ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT - vector_sum_row != nullptr ? 
vector_sum_row->info() : nullptr, // NOLINT - a_offset, b_offset)); // NOLINT - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) @@ -381,33 +375,43 @@ void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITe { // Check if vector_sum_col_shape should be slid or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1; + _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; } // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps()); - INEKernel::configure(win); + Window win = calculate_max_window(*mm_result, Steps()); + ICpuKernel::configure(win); } -Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) +Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); - return Status{}; } -void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info) +void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto mm_result = tensors.get_tensor(TensorType::ACL_DST); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = _vector_sum_row != nullptr - && _mm_result->info()->num_dimensions() > 1 - && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->info()->num_dimensions() > 1 + && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); + run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); +} + +const char *CpuGemmLowpOffsetContributionKernel::name() const +{ + return "CpuGemmLowpOffsetContributionKernel"; } -} // namespace arm_compute +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h index f71929fe9e..f23a46cde7 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h +++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h @@ -21,18 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H -#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H -#include "src/core/NEON/INEKernel.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" namespace arm_compute { -class ITensor; - -/** Kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place +namespace cpu +{ +namespace kernels +{ +/** Kernel used to add the offset contribution after @ref CpuGemmLowpMatrixMultiplyKernel. The computation is performed in-place * - * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), * and adds to it the offset contribution of matrix A and matrix B in-place. * * The final result is: @@ -43,28 +46,15 @@ class ITensor; * (a_offset * b_offset * k) * */ -class NEGEMMLowpOffsetContributionKernel : public INEKernel +class CpuGemmLowpOffsetContributionKernel : public ICpuKernel { public: - const char *name() const override - { - return "NEGEMMLowpOffsetContributionKernel"; - } - /** Constructor */ - NEGEMMLowpOffsetContributionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default; - /** Default destructor */ - ~NEGEMMLowpOffsetContributionKernel() = default; + /** Default constructor */ + CpuGemmLowpOffsetContributionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionKernel); /** Initialise the kernel's input and output. * - * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in, out] mm_result Input tensor containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. @@ -73,33 +63,26 @@ public: * @param[in] a_offset Offset to be added to each element of the matrix A. * @param[in] b_offset Offset to be added to each element of the matrix B. 
*/ - void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel + void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + /** Static function to check if given info will lead to a valid configuration * - * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. + * Similar to CpuGemmLowpOffsetContributionKernel::configure() * * @return a status */ static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; private: - const ITensor *_vector_sum_col; - const ITensor *_vector_sum_row; - ITensor *_mm_result; - int32_t _a_offset; - int32_t _b_offset; - int32_t _k_offset; - bool _slide_vector_sum_col; + int32_t _a_offset{ 0 }; + int32_t _b_offset{ 0 }; + int32_t _k_offset{ 0 }; + bool _slide_vector_sum_col{ true }; }; +} // namespace kernels +} // namespace cpu } // namespace arm_compute - -#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */ +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H */ diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp index dfed7f0bb8..332ce6f013 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -37,12 +37,13 @@ #include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> -#include <cstddef> -#include <cstdint> -#include <map> namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) @@ -836,53 +837,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *output) -{ - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps()); - - // Note: This kernel performs 16 elements per iteration. - // However, since we use a left-over for loop, we cannot have any read or write out of memory - // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped - - return std::make_pair(Status{}, win); -} } // namespace -NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true), - _output_stage(GEMMLowpOutputStageInfo()) - -{ -} - -void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col, - const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t k, int32_t a_offset, int32_t b_offset, - GEMMLowpOutputStageInfo output_stage) +void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, + int32_t k, int32_t a_offset, int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { + ARM_COMPUTE_UNUSED(vector_sum_row, bias); // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT - bias != nullptr ? 
bias->info() : nullptr, // NOLINT - output->info(), a_offset, b_offset, output_stage)); // NOLINT - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _bias = bias; - _mm_result = mm_result; - _output = output; - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; - _output_stage = output_stage; + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; + _output_stage = output_stage; // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) @@ -890,40 +860,51 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_ { // Check if vector_sum_col_shape should be slid or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1; + _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; } + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, mm_result->clone()->set_data_type(DataType::QASYMM8)); + // Configure kernel window - auto win_config = validate_and_configure_window(mm_result->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*mm_result, Steps()); + + // Note: This kernel performs 16 elements per iteration. + // However, since we use a left-over for loop, we cannot have any read or write out of memory + // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped + ICpuKernel::configure(win); } -Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, + const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), output->clone().get()).first); return Status{}; } -void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, const ThreadInfo &info) +void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto mm_result = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_3); + auto dst = tensors.get_tensor(TensorType::ACL_DST); PixelValue type_min{}; PixelValue 
type_max{}; - std::tie(type_min, type_max) = get_min_max(_output->info()->data_type()); + std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); int32_t type_min_int = type_min.get<int32_t>(); int32_t type_max_int = type_max.get<int32_t>(); - const bool reinterpret_as_3d = _vector_sum_row != nullptr - && _mm_result->info()->num_dimensions() > 1 - && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->info()->num_dimensions() > 1 + && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); @@ -931,29 +912,35 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, co const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; // Check if the output is signed - const bool is_signed = _output->info()->data_type() == DataType::QASYMM8_SIGNED; + const bool is_signed = dst->info()->data_type() == DataType::QASYMM8_SIGNED; // Check if symmetric per-channel execution const bool is_symm = _output_stage.is_quantized_per_channel; if(is_symm) { - run_offset_contribution_output_stage_symm(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { if(is_signed) { - run_offset_contribution_output_stage<int8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + run_offset_contribution_output_stage<int8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { - run_offset_contribution_output_stage<uint8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + run_offset_contribution_output_stage<uint8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } } } +const char *CpuGemmLowpOffsetContributionOutputStageKernel::name() const +{ + return "CpuGemmLowpOffsetContributionOutputStageKernel"; +} +} // namespace kernels +} // namespace cpu } // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h new file mode 100644 index 0000000000..404f2c9496 --- /dev/null +++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel used to add the offset contribution and perform the output stage after @ref CpuGemmLowpMatrixMultiplyKernel. + * + * The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8. + * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8. + * + * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is: + * + * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift + * + * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is: + * + * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift + * + * where FixedPointMul(x, y) is the nearest integer to the following + * mathematical expression, evaluated without overflow or intermediate rounding: + * + * (x * y) / 2^31 + * + * and mm_result'[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + */ + +class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpOffsetContributionOutputStageKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionOutputStageKernel); + /** Initialise the kernel inputs and output. + * + * @param[in] mm_result Input tensor info containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector tensor info of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. 
Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector tensor info of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. + * @param[out] dst Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. + */ + void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpOffsetContributionOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + int32_t _a_offset{ 0 }; + int32_t _b_offset{ 0 }; + int32_t _k_offset{ 0 }; + bool _slide_vector_sum_col{ true }; + GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h index f3cdbdc610..ca5e1b40fc 100644 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h @@ -38,7 +38,7 @@ namespace kernels { /** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED * - * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel: * * -# Add offset terms to final result diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h index 7a1197d2cf..e360e65bae 100644 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -38,7 +38,7 @@ namespace kernels { /** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 * - * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value. + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value. * The following computations will be performed by the kernel: * * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h index 9ebb529990..9c213abdf7 100644 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -38,7 +38,7 @@ namespace kernels { /** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED * - * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. * The following computations will be performed by the kernel: * * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index 312cad971b..13b30f3427 100644 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -38,7 +38,7 @@ namespace kernels { /** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 * - * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. * The following computations will be performed by the kernel: * * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier |
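Note: combining the offset-contribution formula documented in CpuGemmLowpOffsetContributionOutputStageKernel.h with the fixed-point quantize-down stages above, the per-element math reduces to a short scalar model. The sketch below is illustrative only: the names are hypothetical, the saturating corner case of the doubling high multiply and the rounding right shift are simplified, and the clamp uses the full QASYMM8 range instead of gemmlowp_min_bound/gemmlowp_max_bound.

#include <algorithm>
#include <cstdint>

// Nearest integer to (x * m) / 2^31, computed in 64-bit to avoid intermediate
// overflow (the FixedPointMul of the kernel documentation; saturation of the
// INT32_MIN * INT32_MIN corner case is omitted in this model).
inline int32_t fixed_point_mul(int32_t x, int32_t m)
{
    const int64_t prod  = static_cast<int64_t>(x) * static_cast<int64_t>(m);
    const int64_t nudge = prod >= 0 ? (1LL << 30) : (1 - (1LL << 30));
    return static_cast<int32_t>((prod + nudge) >> 31);
}

// One element of the fused offset contribution + fixed-point output stage:
//   mm_result'[i][k] = mm_result[i][k] + vector_sum_col[k] * a_offset
//                    + vector_sum_row[i] * b_offset + a_offset * b_offset * k
// followed by (FixedPointMul(acc, multiplier) >> shift) + offset, clamped to U8.
inline uint8_t contribute_and_quantize(int32_t mm_result, int32_t vector_sum_col_k, int32_t vector_sum_row_i,
                                       int32_t a_offset, int32_t b_offset, int32_t k,
                                       int32_t result_fixedpoint_multiplier, int32_t result_shift,
                                       int32_t result_offset_after_shift)
{
    int32_t acc = mm_result + vector_sum_col_k * a_offset + vector_sum_row_i * b_offset + a_offset * b_offset * k;
    acc         = fixed_point_mul(acc, result_fixedpoint_multiplier) >> result_shift; // plain shift: rounding divide simplified
    acc += result_offset_after_shift;
    return static_cast<uint8_t>(std::min(255, std::max(0, acc)));
}

The model also makes explicit why vector_sum_col may be a nullptr when a_offset == 0 and vector_sum_row when b_offset == 0: the corresponding contribution terms vanish from mm_result'.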