From e75a02b60736f37c34388c23c0ccee230f65da59 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Wed, 8 Nov 2017 12:24:09 +0000 Subject: COMPMID-675 - Reworked NEGEMMLowp interface/function The new interface makes NEGEMMLowp able to work with ASYMM8 data types. Implemented 2 new functions: - NEGEMMLowpMatrixMultiplyCore - NEGEMMLowpOutputStage These functions should make the integration in android NN doable For more information about GEMMLowp: https://github.com/google/gemmlowp/blob/master/doc/low-precision.md Change-Id: Ie2c775f45234f68ca53dba644b3a912b997fd890 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95504 Tested-by: Kaizen Reviewed-by: Pablo Tello --- arm_compute/core/NEON/NEKernels.h | 3 +- .../core/NEON/kernels/NEGEMMLowpFinalizeKernel.h | 103 ----- .../NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 4 +- .../kernels/NEGEMMLowpOffsetContributionKernel.h | 79 ++++ ...NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 78 ++++ .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 27 +- arm_compute/runtime/NEON/NEFunctions.h | 2 +- arm_compute/runtime/NEON/functions/NEGEMMLowp.h | 94 ---- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 33 +- .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 69 +++ docs/00_introduction.dox | 2 +- .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 3 +- src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp | 509 --------------------- .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 128 +++--- .../kernels/NEGEMMLowpOffsetContributionKernel.cpp | 338 ++++++++++++++ ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 141 ++++++ .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 176 +++---- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 3 +- src/runtime/NEON/functions/NEGEMMLowp.cpp | 134 ------ .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 84 +++- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 37 ++ tests/benchmark/NEON/GEMMLowp.cpp | 1 - tests/datasets/GEMMLowpDataset.h | 36 +- tests/datasets/LargeGEMMLowpDataset.h | 12 +- tests/datasets/SmallGEMMLowpDataset.h | 12 +- tests/validation/CPP/GEMMLowp.cpp | 67 +-- tests/validation/CPP/GEMMLowp.h | 6 +- tests/validation/NEON/GEMMLowp.cpp | 93 +++- tests/validation/fixtures/GEMMLowpFixture.h | 94 ++-- 29 files changed, 1171 insertions(+), 1197 deletions(-) delete mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h delete mode 100644 arm_compute/runtime/NEON/functions/NEGEMMLowp.h create mode 100644 arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h delete mode 100644 src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp delete mode 100644 src/runtime/NEON/functions/NEGEMMLowp.cpp create mode 100644 src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 8dedf38b3e..d78cec2a62 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -61,8 +61,9 @@ #include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h" -#include 
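As a rough usage sketch of the reworked interface described in the commit message (illustrative only, not part of this patch: the tensor shapes, offset and scale values below are invented, the helper function name is hypothetical, and how the a/b offsets are attached to the QASYMM8 input tensors is not shown in these hunks):

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_gemmlowp_example() // hypothetical helper, for illustration
    {
        Tensor a, b, mm_result, output;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8)); // A: M=32, K=64
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::QASYMM8)); // B: K=64, N=16
        mm_result.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::S32));
        output.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8));

        NEGEMMLowpMatrixMultiplyCore            mm_core;
        NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
        mm_core.configure(&a, &b, &mm_result);   // QASYMM8 x QASYMM8 -> S32 accumulators
        output_stage.configure(&mm_result, &output, /* result_offset */ -50, /* result_mult_int */ 2, /* result_shift */ 8);

        // ... allocate the four tensors and fill a and b with QASYMM8 data ...
        mm_core.run();      // matrix product plus offset contributions, in S32
        output_stage.run(); // quantize the S32 accumulators down to QASYMM8
    }

The point of the split is that the S32 accumulators stay visible between the two stages, which is what the Android NN integration needs.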
"arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h deleted file mode 100644 index 8908fabc1e..0000000000 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPFINALIZEKERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPFINALIZEKERNEL_H__ - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/* NEON kernel used to finalize the GEMMLowp result - * - * This kernel performs the following computations: - * - * -# Add offset terms to final result - * -# Multiply each entry of result and round to nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. - */ -class NEGEMMLowpFinalizeKernel : public INEKernel -{ -public: - /** Constructor */ - NEGEMMLowpFinalizeKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpFinalizeKernel(const NEGEMMLowpFinalizeKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpFinalizeKernel &operator=(const NEGEMMLowpFinalizeKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMLowpFinalizeKernel(NEGEMMLowpFinalizeKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMLowpFinalizeKernel &operator=(NEGEMMLowpFinalizeKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @note The input row-vectors @p vector_sum_col and @p vector_sum_row must be the output of @ref NEGEMMLowpMatrixBReductionKernel and @ref NEGEMMLowpMatrixAReductionKernel kernels. 
- * These 2 vectors are needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - * - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of input1. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of input0. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p vector_sum_col - * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: same as @p vector_sum_col - * @param[out] output Output tensor containing the result of GEMMLowP. Data type supported: S8 - * @param[in] num_mtx_a_cols Number of matrix A columns - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the output matrix - * @param[in] c_mult_int Value to be multiplied to each entry of the result. - * @param[in] shift Number of bits to shift right the result. - */ - void configure(const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *mm_result, ITensor *output, int32_t num_mtx_a_cols, int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t c_mult_int, int32_t shift); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - /** Template function to run the finalize kernel - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void finalize(const Window &window); - using FinalizeFunctionPtr = void (NEGEMMLowpFinalizeKernel::*)(const Window &window); - - FinalizeFunctionPtr _func; - const ITensor *_vector_sum_col; - const ITensor *_vector_sum_row; - const ITensor *_mm_result; - ITensor *_output; - int32_t _a_offset; - int32_t _b_offset; - int32_t _c_offset; - int32_t _k_offset; - int32_t _c_mult_int; - int32_t _shift; - bool _slide_vector_sum_col; -}; -} // namespace arm_compute - -#endif /* __ARM_COMPUTE_NEGEMMLOWPFINALIZEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h index f145eb6ca3..e9bfe4ea07 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h @@ -58,8 +58,8 @@ public: * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two * kernels change the layout of the original matrices to be more cache-friendly. * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: S8 - * @param[in] input1 Input tensor containing the transposed Matrix B. Data type supported: same as @p input0 + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: ASYMM8 + * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0 * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type supported: S32 */ void configure(const ITensor *input0, const ITensor *input1, ITensor *output); diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..04b84339b0 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/* NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @NEGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + */ +class NEGEMMLowpOffsetContributionKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpOffsetContributionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. 
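For reference, a scalar sketch of the in-place update this kernel performs (inferred from the kernel description above and from the gemmlowp low-precision notes; the function name and the raw-pointer layout are assumptions, not code from this patch):

    #include <cstdint>

    // Hypothetical scalar equivalent of NEGEMMLowpOffsetContributionKernel (illustration only).
    // mm_result: M x N S32 accumulators (row-major); vector_sum_col: N entries; vector_sum_row: M entries;
    // k: number of matrix A columns (= matrix B rows).
    void offset_contribution_ref(int32_t *mm_result, const int32_t *vector_sum_col, const int32_t *vector_sum_row,
                                 int M, int N, int k, int32_t a_offset, int32_t b_offset)
    {
        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                mm_result[i * N + j] += a_offset * vector_sum_col[j]   // skipped when a_offset == 0
                                        + b_offset * vector_sum_row[i] // skipped when b_offset == 0
                                        + a_offset * b_offset * k;     // constant term (the _k_offset member)
            }
        }
    }
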
Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_vector_sum_col; + const ITensor *_vector_sum_row; + ITensor *_mm_result; + int32_t _a_offset; + int32_t _b_offset; + int32_t _k_offset; + bool _slide_vector_sum_col; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h new file mode 100644 index 0000000000..65f1042b9c --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ +#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/* NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result and round to nearest integer + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. 
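A scalar sketch of the three steps listed above (illustration only, not part of this patch; acc, result_offset, result_mult_int and result_shift stand for one S32 accumulator and the three configure() parameters, and the rounding term follows the formula documented in NEGEMMLowpOutputStage.h further down):

    const int32_t rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    int32_t tmp = (acc + result_offset) * result_mult_int;   // add offset, then scale
    tmp = (tmp + rounding) >> result_shift;                   // round to nearest and shift right
    tmp = tmp < 0 ? 0 : (tmp > 255 ? 255 : tmp);              // clamp to [0..255]
    const uint8_t out = static_cast<uint8_t>(tmp);            // cast to QASYMM8
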
+ * + */ +class NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + */ + void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + int32_t _result_offset; + int32_t _result_mult_int; + int32_t _result_shift; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h index a069969681..6eee54a9f0 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -45,10 +45,9 @@ public: /** Allow instances of this class to be moved */ INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default; -public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor containing the interleaved or transposed matrix. Data type supported: S8 + * @param[in] input Input tensor. Data type supported: S8 * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 * @param[in] k Number of matrix A columns (or matrix B rows) * @param[in] is_reshaped True if the input tensor has been reshaped @@ -72,14 +71,12 @@ class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel public: /** Initialise the kernel's input and output. * - * @note The input matrix @p mtx_a_interleaved4x4 must be the output of @ref NEGEMMInterleave4x4Kernel. - * - * @param[in] mtx_a_interleaved4x4 Input tensor containing the interleaved Matrix A. Data type supported: U8 - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a_interleaved4x4. Data type supported: S32 - * @param[in] num_mtx_a_cols Number of matrix A columns - * @param[in] is_interleaved4x4 True if the input tensor is interleaved4x4 + * @param[in] mtx_a Input tensor. 
Data type supported: QASYMM8 + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] num_mtx_a_cols Number of matrix A columns + * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 */ - void configure(const ITensor *mtx_a_interleaved4x4, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override; + void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -95,14 +92,12 @@ class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel public: /** Initialise the kernel's input and output. * - * @note The input matrix @p mtx_b_transposed1xW must be the output of @ref NEGEMMTranspose1xWKernel kernel. - * - * @param[in] mtx_b_transposed1xW Input tensor containing the transposed Matrix B. Data type supported: Data type supported: U8 - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b_transposed1xW. Data type supported: S32 - * @param[in] num_mtx_b_rows Number of matrix B rows - * @param[in] is_transposed1xW True if the input tensor is transposed 1xW + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8 + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] num_mtx_b_rows Number of matrix B rows + * @param[in] is_transposed1xW True if the input tensor is transposed 1xW */ - void configure(const ITensor *mtx_b_transposed1xW, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override; + void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 563ade288a..118603b20b 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -60,8 +60,8 @@ #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" #include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h deleted file mode 100644 index 59c919e161..0000000000 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
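For clarity, a scalar sketch of what the two reduction kernels documented above compute (illustration only; the function name and row-major buffer layout are assumptions):

    #include <cstdint>

    // Hypothetical scalar equivalent of NEGEMMLowpMatrixAReductionKernel:
    // one S32 sum per row of the QASYMM8 matrix A (M rows, num_mtx_a_cols columns, row-major).
    void matrix_a_reduction_ref(const uint8_t *A, int32_t *vector_sum_row, int M, int num_mtx_a_cols)
    {
        for(int i = 0; i < M; ++i)
        {
            int32_t sum = 0;
            for(int k = 0; k < num_mtx_a_cols; ++k)
            {
                sum += static_cast<int32_t>(A[i * num_mtx_a_cols + k]);
            }
            vector_sum_row[i] = sum;
        }
    }
    // NEGEMMLowpMatrixBReductionKernel is the analogous column-wise sum over matrix B.
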
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWP_H__ -#define __ARM_COMPUTE_NEGEMMLOWP_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/Tensor.h" - -#include - -namespace arm_compute -{ -class ITensor; - -/** Basic function to execute GEMMLowp on NEON. This function calls the following NEON kernels/function: - * - * -# @ref NEGEMMLowpMatrixAReductionKernel - * -# @ref NEGEMMLowpMatrixBReductionKernel - * -# @ref NEGEMMLowpMatrixMultiplyCore - * -# @ref NEGEMMLowpFinalizeKernel - * -*/ -class NEGEMMLowp : public IFunction -{ -public: - /** Constructor */ - NEGEMMLowp(std::shared_ptr memory_manager = nullptr); - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision GEMM kernel - * This kernel performs the following computations: - * - * -# Convert a values from int8 to int32 and add a_offset to each of them. - * -# Convert b values from int8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to int8. - * - * @param[in] a First input tensor (Matrix A). Data type supported: S8. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[out] output Output tensor. Data type supported: same as @p a. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Value to be multiplied to each element of the output matrix - * @param[in] shift Number of bits to shift right the result. 
- */ - void configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t output_mult_int, int32_t shift); - - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; - NEGEMMLowpMatrixMultiplyCore _mm_func; - NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; - NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; - NEGEMMLowpFinalizeKernel _finalize_kernel; - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _mm_output; - int32_t _a_offset; - int32_t _b_offset; -}; -} -#endif /*__ARM_COMPUTE_NEGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index c81a432295..0c441df4b9 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -25,6 +25,8 @@ #define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -41,11 +43,13 @@ class ITensor; * -# @ref NEGEMMInterleave4x4Kernel * -# @ref NEGEMMTranspose1xWKernel * -# @ref NEGEMMLowpMatrixMultiplyKernel + * -# @ref NEGEMMLowpOffsetContributionKernel * * otherwise if the DOT product instruction is available: * * -# @ref NEGEMMInterleaveBlockedKernel * -# @ref NEGEMMLowpAArch64V8P4Kernel + * -# @ref NEGEMMLowpOffsetContributionKernel * */ class NEGEMMLowpMatrixMultiplyCore : public IFunction @@ -58,11 +62,11 @@ public: * @note GEMM_LOWP: low precision GEMM kernel * This kernel performs the following computations: * - * -# Convert a values from uint8 to int32 - * -# Convert b values from uint8 to int32 - * -# Compute the int32 matrix product of the resulting a * b. + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. * - * @param[in] a First input tensor (Matrix A). Data type supported: U8. + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a * @param[out] output Output tensor. 
Data type supported: Data type supported: S32 */ @@ -72,13 +76,20 @@ public: void run() override; private: - MemoryGroup _memory_group; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _workspace; + MemoryGroup _memory_group; + std::unique_ptr _mm_kernel; + std::unique_ptr _mtx_a_reshape_kernel; + std::unique_ptr _mtx_b_reshape_kernel; + NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + Tensor _vector_sum_col; + Tensor _vector_sum_row; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _workspace; + int32_t _a_offset; + int32_t _b_offset; }; } #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h new file mode 100644 index 0000000000..8557ef42e1 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ +#define __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +/** This file contains all available output stages for GEMMLowp on NEON. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final ASYMM8 value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8Scale on NEON. + * + * NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift + * The final result is: + * + * ((input[i][k] + result_offset) * result_mult_int + rounding) >> result_shift + * + * where rounding = (result_shift < 1) ? 
0 : (1 << (result_shift - 1)) + * + * This function calls the following NEON kernels: + * + * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * +*/ +class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + */ + void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift); +}; +} +#endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 72a0ede29a..696364373d 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -260,7 +260,7 @@ v17.03.1 First Major public release of the sources - @ref arm_compute::NELogits1DMaxKernel, @ref arm_compute::NELogits1DShiftExpSumKernel, @ref arm_compute::NELogits1DNormKernel / @ref arm_compute::NESoftmaxLayer - @ref arm_compute::NEIm2ColKernel, @ref arm_compute::NECol2ImKernel, arm_compute::NEConvolutionLayerWeightsReshapeKernel / @ref arm_compute::NEConvolutionLayer - @ref arm_compute::NEGEMMMatrixAccumulateBiasesKernel / @ref arm_compute::NEFullyConnectedLayer - - @ref arm_compute::NEGEMMLowpMatrixMultiplyKernel / @ref arm_compute::NEGEMMLowp + - @ref arm_compute::NEGEMMLowpMatrixMultiplyKernel / arm_compute::NEGEMMLowp v17.03 Sources preview - New OpenCL kernels / functions: diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index ae5d456141..a29b661a00 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -132,7 +132,8 @@ NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel() void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); diff --git a/src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp deleted file mode 100644 index 255e486365..0000000000 --- a/src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include -#include -#include - -using namespace arm_compute; - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -template -void NEGEMMLowpFinalizeKernel::finalize(const Window &window) -{ - const int32x4_t c_offset_s32 = vdupq_n_s32(_c_offset); - const int32x4_t shift_s32 = vdupq_n_s32(-_shift); - - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ); - - if(add_a_offset && add_b_offset) // true, true - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(!_slide_vector_sum_col) - { - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - } - - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); - Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); - - // Compute the leftover term due to b_offset. 
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = - { - { - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32)); - - // Add c_offset - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], c_offset_s32); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], c_offset_s32); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], c_offset_s32); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], c_offset_s32); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to U16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])), - } - }; - - // Convert S16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - vector_sum_col, vector_sum_row, mm_result, out); - } - else if(!add_a_offset && add_b_offset) // false, true - { - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Compute the leftover term due to b_offset. 
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); - - // Add b_offset_term_s32 and c_offset_term_s32 - int32x4_t offset_term_s32 = vaddq_s32(b_offset_term_s32, c_offset_s32); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to U16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])), - } - }; - - // Convert S16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - vector_sum_row, mm_result, out); - } - else if(add_a_offset && !add_b_offset) // true, false - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(!_slide_vector_sum_col) - { - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - } - - Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Compute the leftover term due to a_offset. 
- int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = - { - { - vaddq_s32(c_offset_s32, a_offset_term_s32.val[0]), - vaddq_s32(c_offset_s32, a_offset_term_s32.val[1]), - vaddq_s32(c_offset_s32, a_offset_term_s32.val[2]), - vaddq_s32(c_offset_s32, a_offset_term_s32.val[3]) - } - }; - - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - vector_sum_col, mm_result, out); - } - else // false, false - { - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], c_offset_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], c_offset_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], c_offset_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], c_offset_s32); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], 
shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert U16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - mm_result, out); - } -} - -NEGEMMLowpFinalizeKernel::NEGEMMLowpFinalizeKernel() - : _func(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _c_offset(0), _k_offset(0), _c_mult_int(0), _shift(0), - _slide_vector_sum_col(true) -{ -} - -void NEGEMMLowpFinalizeKernel::configure(const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *mm_result, ITensor *output, int32_t num_mtx_a_cols, int32_t a_offset, - int32_t b_offset, - int32_t c_offset, int32_t c_mult_int, int32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); - - TensorShape mm_result_shape = mm_result->info()->tensor_shape(); - TensorShape output_shape = output->info()->tensor_shape(); - - mm_result_shape.collapse(2); - output_shape.collapse(2); - - ARM_COMPUTE_ERROR_ON_MSG(mm_result_shape[2] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); - - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); - vector_sum_col_shape.collapse(1); - - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col_shape[1] != 1; - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); - - TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); - vector_sum_row_shape.collapse(1); - - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); - vector_sum_col_shape.collapse(1); - - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 - && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _output = output; - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * num_mtx_a_cols; - _c_offset = c_offset; - _c_mult_int = c_mult_int; - _shift = shift; - - constexpr unsigned 
int num_elems_processed_per_iteration = 16; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration); - - // Accordingly with a_offset and b_offset, we can have 4 cases: - // a_offset != 0 && b_offset != 0 - // a_offset = 0 && b_offset != 0 - // a_offset != 0 && b_offset = 0 - // a_offset = 0 && b_offset = 0 - if(a_offset != 0 && b_offset != 0) - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); - AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - vector_sum_col_access, - vector_sum_row_access, - mm_result_access, - output_result_access); - } - else if(a_offset == 0 && b_offset != 0) - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); - - update_window_and_padding(win, - vector_sum_row_access, - mm_result_access, - output_result_access); - } - else if(a_offset != 0 && b_offset == 0) - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - vector_sum_col_access, - mm_result_access, - output_result_access); - } - else - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - update_window_and_padding(win, - mm_result_access, - output_result_access); - } - - output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - - INEKernel::configure(win); -} - -void NEGEMMLowpFinalizeKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - (this->*_func)(window); -} diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 4b9c9f3e64..1352f34e3c 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -52,7 +52,7 @@ NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); @@ -127,115 +127,115 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo // All the values needed for computing a single 4x4 block will be read from consecutive memory positions execute_window_loop(window, [&](const Coordinates & id) { - auto *mtx_a0 = reinterpret_cast(ina.ptr()); - auto *mtx_b0 = reinterpret_cast(inb.ptr()); + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); // Note: 
Since the input are all positives, we can use uint32_t // Accumulators for the block 0 - int32x4x4_t c0 = + uint32x4x4_t c0 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; // Accumulators for the block 1 - int32x4x4_t c1 = + uint32x4x4_t c1 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; // Accumulators for the block 2 - int32x4x4_t c2 = + uint32x4x4_t c2 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; // Accumulators for the block 3 - int32x4x4_t c3 = + uint32x4x4_t c3 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) { - const int8x8_t a00_s8 = vld1_s8(mtx_a0); - const int8x16_t b00_s8 = vld1q_s8(mtx_b0); + const uint8x8_t a00_u8 = vld1_u8(mtx_a0); + const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - // Convert b00_s8 to int16_t - const int16x4x4_t b00_s16 = + // Convert b00_s8 to uint16_t + const uint16x4x4_t b00_u16 = { { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) } }; // 4x4 block 0 - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); // 4x4 block 1 - c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); - c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); - c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); - c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); + c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); + c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); + c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); + c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); // 4x4 block 2 - c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); - c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); - c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); - c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); + c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); + c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); + c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); + c2.val[3] = vmlal_lane_u16(c2.val[3], 
b00_u16.val[3], a00_u16, 2); // 4x4 block 3 - c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); - c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); - c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); - c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); + c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); + c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); + c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); + c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); } auto mtx_out = reinterpret_cast(out.ptr()); - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); }, ina, inb, out); } diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..bd550db54c --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include
+#include
+#include
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
+{
+}
+
+void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0));
+
+        TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+        vector_sum_col_shape.collapse(1);
+
+        // Check whether vector_sum_col has to be slid or not
+        // Don't slide vector_sum_col along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape has more than 1
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1));
+
+        TensorShape output_shape = mm_result->info()->tensor_shape();
+        TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape();
+        vector_sum_row_shape.collapse(1);
+        output_shape.collapse(2);
+
+        ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches as the
output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + vector_sum_col_shape.collapse(1); + + ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 + && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); + + // Accordingly with a_offset and b_offset, we can have 4 cases: + // a_offset != 0 && b_offset != 0 + // a_offset = 0 && b_offset != 0 + // a_offset != 0 && b_offset = 0 + // a_offset = 0 && b_offset = 0 + if(a_offset != 0 && b_offset != 0) + { + AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); + AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + vector_sum_col_access, + vector_sum_row_access, + mm_result_access); + } + else if(a_offset == 0 && b_offset != 0) + { + AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); + + update_window_and_padding(win, + vector_sum_row_access, + mm_result_access); + } + else if(a_offset != 0 && b_offset == 0) + { + AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + vector_sum_col_access, + mm_result_access); + } + else + { + update_window_and_padding(win, + mm_result_access); + } + + INEKernel::configure(win); +} + +void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ); + + if(_a_offset != 0 && _b_offset != 0) // true, true + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + if(!_slide_vector_sum_col) + { + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + } + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); + Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); + Iterator mm_result(_mm_result, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Compute the leftover term due to a_offset. 
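+            // This term equals a_offset * sum_k B(k, j): vector_sum_col holds the per-column sums of B
+            // produced by NEGEMMLowpMatrixBReductionKernel, so the 16 values loaded below for the current
+            // block of output columns only need to be scaled by a_offset.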
+ int32x4x4_t a_offset_term_s32 = + { + { + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); + + // Compute the leftover term due to b_offset. + int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = + { + { + vdupq_n_s32(_k_offset), + vdupq_n_s32(_k_offset), + vdupq_n_s32(_k_offset), + vdupq_n_s32(_k_offset) + } + }; + + offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32)); + offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32)); + offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32)); + offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32)); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + }, + vector_sum_col, vector_sum_row, mm_result); + } + else if((_a_offset == 0) && (_b_offset != 0)) // false, true + { + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); + Iterator mm_result(_mm_result, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Compute the leftover term due to b_offset. 
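+            // This term equals b_offset * sum_k A(i, k): vector_sum_row holds the per-row sums of A
+            // produced by NEGEMMLowpMatrixAReductionKernel, so a single value is loaded for the current
+            // output row (id.y()), broadcast across the vector and scaled by b_offset.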
+ int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32); + + // Store the result with the offset contribution + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + }, + vector_sum_row, mm_result); + } + else if((_a_offset != 0) && (_b_offset == 0)) // true, false + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + if(!_slide_vector_sum_col) + { + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + } + + Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); + Iterator mm_result(_mm_result, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Compute the leftover term due to a_offset. + int32x4x4_t a_offset_term_s32 = + { + { + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + }, + vector_sum_col, mm_result); + } + else // false, false + { + // No offset contribution from matrix A and matrix B + return; + } +} diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp new file mode 100644 index 0000000000..aa3c280788 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp @@ -0,0 
+1,141 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel() + : _input(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0) +{ +} + +void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + + _input = input; + _output = output; + _result_offset = result_offset; + _result_mult_int = result_mult_int; + _result_shift = result_shift; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_result_access); + + output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(-_result_shift); + const int32x4_t zero_s32 = vdupq_n_s32(0); + + Iterator in(_input, window); + Iterator out(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + int32x4x4_t in_s32 = + { + { + 
vld1q_s32(reinterpret_cast(in.ptr()) + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); + + // Multiply by c_mult_int + in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _result_mult_int); + in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _result_mult_int); + in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _result_mult_int); + in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _result_mult_int); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to U8 + const uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); + + vst1q_u8(out.ptr(), out_u8); + }, + in, out); +} \ No newline at end of file diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 9df13ce0e3..81d9b5bb81 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -49,12 +49,12 @@ INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel() { } -void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a_interleaved4x4, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) +void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a_interleaved4x4, 1, DataType::S8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - _input = mtx_a_interleaved4x4; + _input = mtx_a; _output = vector_sum_row; _k = num_mtx_a_cols; _is_reshaped = is_interleaved4x4; @@ -97,9 +97,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4_t sum_row = vdupq_n_s32(0); + uint32x4_t sum_row = vdupq_n_u32(0); - auto matrix_a = reinterpret_cast(in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -109,43 +109,43 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, 
const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const int8x16_t a0_s8 = vld1q_s8(matrix_a + i * 4); + const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4); // Convert U8 to U16 - int16x4x4_t a0_s16 = + uint16x4x4_t a0_u16 = { { - vget_low_s16(vmovl_s8(vget_low_s8(a0_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(a0_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(a0_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(a0_s8))) + vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(a0_u8))) } }; // Accumulate to U16 - a0_s16.val[0] = vadd_s16(a0_s16.val[0], a0_s16.val[1]); - a0_s16.val[0] = vadd_s16(a0_s16.val[0], a0_s16.val[2]); - a0_s16.val[0] = vadd_s16(a0_s16.val[0], a0_s16.val[3]); + a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]); + a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]); + a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]); // Accumulate to U32 - sum_row = vaddw_s16(sum_row, a0_s16.val[0]); + sum_row = vaddw_u16(sum_row, a0_u16.val[0]); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - const int8x8_t a0_s8 = vld1_s8(matrix_a + i * 4); + const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4); // Convert U8 to U16 - const int16x4_t a0_s16 = vget_low_s16(vmovl_s8(a0_s8)); + const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8)); // Accumulate to U32 - sum_row = vaddw_s16(sum_row, a0_s16); + sum_row = vaddw_u16(sum_row, a0_u16); } auto vector_sum_row = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_row, sum_row); + vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row)); }, in, out); } @@ -154,10 +154,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4_t sum_row_s32 = vdupq_n_s32(0); - int32_t sum_row = 0; + uint32x4_t sum_row_u32 = vdupq_n_u32(0); + uint32_t sum_row = 0; - auto matrix_a = reinterpret_cast(in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -167,29 +167,29 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 16 accumulations for(; i <= (_k - 16); i += 16) { - const int8x16_t a0_s8 = vld1q_s8(matrix_a + i); + const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i); // Partial accumulations in U16 - const int16x8_t tmp_sum0 = vaddl_s8(vget_low_s8(a0_s8), vget_high_s8(a0_s8)); + const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8)); // Accumulate to U32 - sum_row_s32 = vaddq_s32(sum_row_s32, vpaddlq_s16(tmp_sum0)); + sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0)); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - sum_row += static_cast(matrix_a[i]); + sum_row += static_cast(matrix_a[i]); } #if defined(__aarch64__) // Reduction operation available on 64 bit architectures only - sum_row += vaddvq_s32(sum_row_s32); + sum_row += vaddvq_u32(sum_row_u32); #else // __aarch64__ - int32x2_t tmp = vpadd_s32(vget_high_s32(sum_row_s32), vget_low_s32(sum_row_s32)); - tmp = vpadd_s32(tmp, tmp); + uint32x2_t tmp 
= vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32)); + tmp = vpadd_u32(tmp, tmp); - sum_row += vget_lane_s32(tmp, 0); + sum_row += vget_lane_u32(tmp, 0); #endif // __aarch64__ *(reinterpret_cast(out.ptr())) = static_cast(sum_row); @@ -198,12 +198,12 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf } } -void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b_transposed1xW, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) +void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b_transposed1xW, 1, DataType::S8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - _input = mtx_b_transposed1xW; + _input = mtx_b; _output = vector_sum_col; _k = num_mtx_b_rows; _is_reshaped = is_transposed1xW; @@ -246,17 +246,17 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4x4_t sum_col = + uint32x4x4_t sum_col = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; - auto matrix_b = reinterpret_cast(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]; #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -265,14 +265,14 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf int i = 0; for(; i < _k; ++i) { - const int8x16_t b0_s8 = vld1q_s8(matrix_b + i * 16); + const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16); // Convert S8 to U16 - const int16x8x2_t b0_s16 = + const uint16x8x2_t b0_u16 = { { - vmovl_s8(vget_low_s8(b0_s8)), - vmovl_s8(vget_high_s8(b0_s8)) + vmovl_u8(vget_low_u8(b0_u8)), + vmovl_u8(vget_high_u8(b0_u8)) } }; @@ -280,20 +280,20 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf sum_col = { { - vaddw_s16(sum_col.val[0], vget_low_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[1], vget_high_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[2], vget_low_s16(b0_s16.val[1])), - vaddw_s16(sum_col.val[3], vget_high_s16(b0_s16.val[1])) + vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), + vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) } }; } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, sum_col.val[0]); - vst1q_s32(vector_sum_col + 4, sum_col.val[1]); - vst1q_s32(vector_sum_col + 8, sum_col.val[2]); - vst1q_s32(vector_sum_col + 12, sum_col.val[3]); + vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); + vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); + vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); + vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); }, in, out); } @@ -326,17 +326,17 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const 
ThreadInf } // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4x4_t sum_col = + uint32x4x4_t sum_col = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; - auto matrix_b = reinterpret_cast(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]; #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -347,10 +347,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const int8x16_t b0_s8 = vld1q_s8(matrix_b + 0 * in_b_stride); - const int8x16_t b1_s8 = vld1q_s8(matrix_b + 1 * in_b_stride); - const int8x16_t b2_s8 = vld1q_s8(matrix_b + 2 * in_b_stride); - const int8x16_t b3_s8 = vld1q_s8(matrix_b + 3 * in_b_stride); + const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); + const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride); + const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride); + const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride); #if __arm__ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); @@ -360,31 +360,31 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf #endif /* __arm__ */ // Partial accumulation in u16 - int16x8x2_t tmp_sum = + uint16x8x2_t tmp_sum = { { - vdupq_n_s16(0), - vdupq_n_s16(0) + vdupq_n_u16(0), + vdupq_n_u16(0) } }; - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b0_s8)); - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b1_s8)); - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b2_s8)); - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b3_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b0_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b1_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b2_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b3_s8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8)); // Accumulate to U32 sum_col = { { - vaddw_s16(sum_col.val[0], vget_low_s16(tmp_sum.val[0])), - vaddw_s16(sum_col.val[1], vget_high_s16(tmp_sum.val[0])), - vaddw_s16(sum_col.val[2], vget_low_s16(tmp_sum.val[1])), - vaddw_s16(sum_col.val[3], vget_high_s16(tmp_sum.val[1])) + vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])), + vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])), + vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])), + vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1])) } }; @@ -394,14 +394,14 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop perfoms the leftover accumulations for(; i < _k; ++i) { - const int8x16_t b0_s8 = vld1q_s8(matrix_b + 0 * in_b_stride); + const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); // Convert S8 to S16 - const int16x8x2_t b0_s16 = + const uint16x8x2_t b0_u16 = 
{ { - vmovl_s8(vget_low_s8(b0_s8)), - vmovl_s8(vget_high_s8(b0_s8)) + vmovl_u8(vget_low_u8(b0_u8)), + vmovl_u8(vget_high_u8(b0_u8)) } }; @@ -409,10 +409,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf sum_col = { { - vaddw_s16(sum_col.val[0], vget_low_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[1], vget_high_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[2], vget_low_s16(b0_s16.val[1])), - vaddw_s16(sum_col.val[3], vget_high_s16(b0_s16.val[1])) + vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), + vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) } }; @@ -421,10 +421,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, sum_col.val[0]); - vst1q_s32(vector_sum_col + 4, sum_col.val[1]); - vst1q_s32(vector_sum_col + 8, sum_col.val[2]); - vst1q_s32(vector_sum_col + 12, sum_col.val[3]); + vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); + vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); + vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); + vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); }, inb, out); } diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 7f4ee1ec49..7f83144e12 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -43,7 +43,8 @@ using namespace arm_compute; void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp deleted file mode 100644 index 90bc6a205b..0000000000 --- a/src/runtime/NEON/functions/NEGEMMLowp.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -NEGEMMLowp::NEGEMMLowp(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _mm_func(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _finalize_kernel(), _vector_sum_col(), _vector_sum_row(), _mm_output(), _a_offset(0), - _b_offset(0) -{ -} - -void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t output_mult_int, int32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN((a), 1, DataType::S8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - - _a_offset = a_offset; - _b_offset = b_offset; - - // Initialize matrix multiply output tensor - const TensorShape &shape_mm_output = output->info()->tensor_shape(); - TensorInfo info_mm_output(shape_mm_output, 1, DataType::S32); - _mm_output.allocator()->init(info_mm_output); - _memory_group.manage(&_mm_output); - - // Initialize Matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - TensorShape shape_vector_sum_col = b->info()->tensor_shape(); - shape_vector_sum_col.remove_dimension(1); - TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); - _vector_sum_col.allocator()->init(info_vector_sum_col); - _memory_group.manage(&_vector_sum_col); - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorShape shape_vector_sum_row = a->info()->tensor_shape(); - shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); - shape_vector_sum_row.remove_dimension(1); - TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure Matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false); - } - - // Configure matrix multiply function - _mm_func.configure(a, b, &_mm_output); - - // Configure finalize kernel - 
_finalize_kernel.configure(_a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, &_mm_output, output, a->info()->dimension(0), a_offset, b_offset, c_offset,
-                           output_mult_int, shift);
-
-    // Allocate tensors
-    _mm_output.allocator()->allocate();
-
-    if(_a_offset != 0)
-    {
-        _vector_sum_col.allocator()->allocate();
-    }
-
-    if(_b_offset != 0)
-    {
-        _vector_sum_row.allocator()->allocate();
-    }
-}
-
-void NEGEMMLowp::run()
-{
-    _memory_group.acquire();
-
-    // Run matrix A reduction kernel only if _b_offset is not equal to 0
-    if(_b_offset != 0)
-    {
-        NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
-    }
-
-    // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0)
-    {
-        NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
-    }
-
-    // Run matrix multiply core function
-    _mm_func.run();
-
-    // Run finalise kernel
-    NEScheduler::get().schedule(&_finalize_kernel, Window::DimY);
-
-    _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 29104cc378..929ee41220 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -47,19 +47,25 @@ namespace arm_compute
 using namespace arm_compute;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
+      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0)
 {
 }
 
 void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
     ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
 
+    bool dot_product_path = false;
+
+    _a_offset = a->info()->quantization_info().offset;
+    _b_offset = b->info()->quantization_info().offset;
+
 #ifdef ARM_COMPUTE_AARCH64_V8_2
     // Check for DOT product instruction
     const struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -67,6 +73,13 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
     if(cpu_has_dotprod != 0)
     {
+        dot_product_path = true;
+
+        // If the DOT product instruction is available, the computation will be performed in int8_t
+        // To take this into account, we subtract 128 from both a_offset and b_offset
+        _a_offset -= 128;
+        _b_offset -= 128;
+
         // Configure matrix multiply kernel
         struct CPUInfo ci =
NEScheduler::get().cpu_info(); const int M = output->info()->tensor_shape().y(); @@ -77,12 +90,11 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, constexpr size_t alignment = 4096; _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8)); _memory_group.manage(&_workspace); + // Configure matrix multiplication kernel auto k = arm_compute::support::cpp14::make_unique(); k->configure(a, b, output, &_workspace, 1.f, 1.f); _mm_kernel = std::move(k); - - _workspace.allocator()->allocate(); } else #endif /* ARM_COMPUTE_AARCH64_V8_2 */ @@ -124,11 +136,58 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, k->configure(&_tmp_a, &_tmp_b, output); _mm_kernel = std::move(k); } + } - // Allocate tensors + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + TensorShape shape_vector_sum_col = b->info()->tensor_shape(); + shape_vector_sum_col.remove_dimension(1); + TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); + _vector_sum_col.allocator()->init(info_vector_sum_col); + _memory_group.manage(&_vector_sum_col); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false); + } + + // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + TensorShape shape_vector_sum_row = a->info()->tensor_shape(); + shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); + shape_vector_sum_row.remove_dimension(1); + TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); + _vector_sum_row.allocator()->init(info_vector_sum_row); + _memory_group.manage(&_vector_sum_row); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false); + } + + // Configure offset contribution kernel + _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); + + // Allocate tensors + if(!dot_product_path) + { _tmp_a.allocator()->allocate(); _tmp_b.allocator()->allocate(); } + else + { + _workspace.allocator()->allocate(); + } + + if(_a_offset != 0) + { + _vector_sum_col.allocator()->allocate(); + } + + if(_b_offset != 0) + { + _vector_sum_row.allocator()->allocate(); + } } void NEGEMMLowpMatrixMultiplyCore::run() @@ -147,5 +206,20 @@ void NEGEMMLowpMatrixMultiplyCore::run() NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + } + + // Run offset contribution kernel + NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp new file mode 100644 index 0000000000..d09827f908 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, result_offset, result_mult_int, result_shift); + _kernel = std::move(k); +} \ No newline at end of file diff --git a/tests/benchmark/NEON/GEMMLowp.cpp b/tests/benchmark/NEON/GEMMLowp.cpp index 8cf143393d..a0e5e694bd 100644 --- a/tests/benchmark/NEON/GEMMLowp.cpp +++ b/tests/benchmark/NEON/GEMMLowp.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" diff --git a/tests/datasets/GEMMLowpDataset.h b/tests/datasets/GEMMLowpDataset.h index 4bf2a98d61..062c05b1d9 100644 --- a/tests/datasets/GEMMLowpDataset.h +++ b/tests/datasets/GEMMLowpDataset.h @@ -37,7 +37,7 @@ namespace datasets class GEMMLowpDataset { public: - using type = std::tuple; + using type = std::tuple; struct iterator { @@ -45,18 +45,12 @@ public: std::vector::const_iterator b_it, std::vector::const_iterator c_it, std::vector::const_iterator a_offset_it, - std::vector::const_iterator b_offset_it, - std::vector::const_iterator c_offset_it, - std::vector::const_iterator c_mult_int_it, - std::vector::const_iterator out_shift_it) + std::vector::const_iterator b_offset_it) : _a_it{ std::move(a_it) }, _b_it{ std::move(b_it) }, _c_it{ std::move(c_it) }, _a_offset_it{ std::move(a_offset_it) }, - _b_offset_it{ std::move(b_offset_it) }, - _c_offset_it{ std::move(c_offset_it) }, - _c_mult_int_it{ std::move(c_mult_int_it) }, - _out_shift_it{ std::move(out_shift_it) } + _b_offset_it{ std::move(b_offset_it) } { } @@ -68,15 +62,12 @@ public: description << "C=" << *_c_it << ":"; description << "a_offset=" << *_a_offset_it << ":"; description << "b_offset=" << *_b_offset_it << ":"; - description << "c_offset=" << *_c_offset_it << ":"; - description << "c_mult_int=" << *_c_mult_int_it << ":"; - 
description << "out_shift=" << *_out_shift_it << ":"; return description.str(); } GEMMLowpDataset::type operator*() const { - return std::make_tuple(*_a_it, *_b_it, *_c_it, *_a_offset_it, *_b_offset_it, *_c_offset_it, *_c_mult_int_it, *_out_shift_it); + return std::make_tuple(*_a_it, *_b_it, *_c_it, *_a_offset_it, *_b_offset_it); } iterator &operator++() @@ -86,9 +77,6 @@ public: ++_c_it; ++_a_offset_it; ++_b_offset_it; - ++_c_offset_it; - ++_c_mult_int_it; - ++_out_shift_it; return *this; } @@ -99,32 +87,25 @@ public: std::vector::const_iterator _c_it; std::vector::const_iterator _a_offset_it; std::vector::const_iterator _b_offset_it; - std::vector::const_iterator _c_offset_it; - std::vector::const_iterator _c_mult_int_it; - std::vector::const_iterator _out_shift_it; }; iterator begin() const { - return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _a_offset.begin(), _b_offset.begin(), _c_offset.begin(), _c_mult_int.begin(), _out_shift.begin()); + return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _a_offset.begin(), _b_offset.begin()); } int size() const { - return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), std::min(_a_offset.size(), std::min(_b_offset.size(), std::min(_c_offset.size(), std::min(_c_mult_int.size(), - _out_shift.size()))))))); + return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), std::min(_a_offset.size(), _b_offset.size())))); } - void add_config(TensorShape a, TensorShape b, TensorShape c, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift) + void add_config(TensorShape a, TensorShape b, TensorShape c, int32_t a_offset, int32_t b_offset) { _a_shapes.emplace_back(std::move(a)); _b_shapes.emplace_back(std::move(b)); _c_shapes.emplace_back(std::move(c)); _a_offset.emplace_back(std::move(a_offset)); _b_offset.emplace_back(std::move(b_offset)); - _c_offset.emplace_back(std::move(c_offset)); - _c_mult_int.emplace_back(std::move(c_mult_int)); - _out_shift.emplace_back(std::move(out_shift)); } protected: @@ -137,9 +118,6 @@ private: std::vector _c_shapes{}; std::vector _a_offset{}; std::vector _b_offset{}; - std::vector _c_offset{}; - std::vector _c_mult_int{}; - std::vector _out_shift{}; }; } // namespace datasets } // namespace test diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h index 10f79e423d..cc1feb49a2 100644 --- a/tests/datasets/LargeGEMMLowpDataset.h +++ b/tests/datasets/LargeGEMMLowpDataset.h @@ -42,12 +42,12 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset public: LargeGEMMLowpDataset() { - add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, 0, 1, 0); - add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, 3, 2, 0); - add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, 1, 1, 0); - add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, -6, 2, 2); - add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, 8, 4, 3); - add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), -9, 1, -3, 3, 1); + add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0); + add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4); + add_config(TensorShape(697U, 872U), TensorShape(563U, 
697U), TensorShape(563U, 872U), -2, 0); + add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13); + add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2); + add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), -9, 1); } }; } // namespace datasets diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h index b7fe3907ad..881546e70f 100644 --- a/tests/datasets/SmallGEMMLowpDataset.h +++ b/tests/datasets/SmallGEMMLowpDataset.h @@ -42,12 +42,12 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset public: SmallGEMMLowpDataset() { - add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0, 0, 1, 0); - add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4, 3, 2, 0); - add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, 1, 1, 0); - add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13, -6, 2, 2); - add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 12U), -3, -2, 8, 4, 3); - add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, -3, 3, 1); + add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0); + add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4); + add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0); + add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13); + add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 12U), -3, -2); + add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1); } }; } // namespace datasets diff --git a/tests/validation/CPP/GEMMLowp.cpp b/tests/validation/CPP/GEMMLowp.cpp index e1d76503cd..bac3a20c8e 100644 --- a/tests/validation/CPP/GEMMLowp.cpp +++ b/tests/validation/CPP/GEMMLowp.cpp @@ -21,10 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "GEMM.h" +#include "GEMMLowp.h" #include "arm_compute/core/Types.h" -#include "tests/validation/FixedPoint.h" namespace arm_compute { @@ -34,17 +33,21 @@ namespace validation { namespace reference { -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c) +template +SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - ARM_COMPUTE_UNUSED(c); - const int K = a.shape().x(); - const int b_width = b.shape().x(); - const int rows = c.shape().y(); //M - const int cols = c.shape().x(); //N + TensorShape shape(b.shape()[0], a.shape()[1]); + + SimpleTensor c(shape, DataType::S32); + + const int K = a.shape().x(); + const int b_width = b.shape().x(); + const int rows = c.shape().y(); //M + const int cols = c.shape().x(); //N + std::vector acc; acc.resize(cols); + for(int i = 0; i < rows; ++i) { for(int j = 0; j < cols; ++j) @@ -53,10 +56,10 @@ SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor } for(int k = 0; k < K; ++k) { - auto tmp_a = static_cast(a[k + i * K]); + const int32_t tmp_a = a_offset + static_cast(a[k + i * K]); for(int j = 0; j < b_width; ++j) { - auto tmp_b = static_cast(b[j + k * b_width]); + const int32_t tmp_b = b_offset + static_cast(b[j + k * b_width]); const int32_t mult_as_int = tmp_a * tmp_b; acc[j] += mult_as_int; } @@ -71,43 +74,21 @@ SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor } template -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift) +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { - const int K = a.shape().x(); - const int b_width = b.shape().x(); - const int rows = c.shape().y(); //M - const int cols = c.shape().x(); //N - std::vector acc; - acc.resize(cols); - for(int i = 0; i < rows; ++i) + SimpleTensor dst(in.shape(), DataType::QASYMM8); + + for(int i = 0; i < in.num_elements(); ++i) { - for(int j = 0; j < cols; ++j) - { - acc[j] = 0; - } - for(int k = 0; k < K; ++k) - { - const int32_t tmp_a = a_offset + static_cast(a[k + i * K]); - for(int j = 0; j < b_width; ++j) - { - const int32_t tmp_b = b_offset + static_cast(b[j + k * b_width]); - const int32_t mult_as_int = tmp_a * tmp_b; - acc[j] += mult_as_int; - } - } - for(int j = 0; j < cols; ++j) - { - const int32_t result = ((c_offset + acc[j]) * c_mult_int) >> out_shift; - c[j + i * cols] = static_cast(std::min(127, std::max(-128, result))); - } + const int32_t result = ((in[i] + result_offset) * result_mult_int) >> result_shift; + dst[i] = static_cast(std::max(0, std::min(255, result))); } - return c; + return dst; } -template SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift); +template SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset); +template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/CPP/GEMMLowp.h b/tests/validation/CPP/GEMMLowp.h index 2f903f2fe2..c09d8f6176 100644 --- 
a/tests/validation/CPP/GEMMLowp.h +++ b/tests/validation/CPP/GEMMLowp.h @@ -35,11 +35,11 @@ namespace validation { namespace reference { -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c); +template +SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset); template -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift); +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 4924f98ea6..4407eff060 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -23,12 +23,15 @@ */ #include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" #include "tests/NEON/Helper.h" +#include "tests/PaddingCalculator.h" #include "tests/datasets/LargeGEMMLowpDataset.h" +#include "tests/datasets/ShapeDatasets.h" #include "tests/datasets/SmallGEMMLowpDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -45,16 +48,13 @@ namespace validation { namespace { -const auto data_int_blk = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9); -const auto data_int_blk_tr = framework::dataset::make("M", 8, 17) * framework::dataset::make("N", 8, 14) * framework::dataset::make("by", 12) * framework::dataset::make("block", 4); -const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framework::dataset::make("N", 12, 20) * framework::dataset::make("K", 16); +const auto data_int_blk = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9); +const auto data_int_blk_tr = framework::dataset::make("M", 8, 17) * framework::dataset::make("N", 8, 14) * framework::dataset::make("by", 12) * framework::dataset::make("block", 4); } // namespace TEST_SUITE(NEON) TEST_SUITE(GEMMLowp) -TEST_SUITE(S8) - TEST_SUITE(INTERLEAVE_BLOCKED) using NEInterleaveBlocked = NESynthetizeFunction; @@ -77,50 +77,95 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleaveBlockedTransposedFixture, frame TEST_SUITE_END() -using NEGEMMLowpOffsetFixture = GEMMLowpOffsetValidationFixture; +TEST_SUITE(MatrixMultiplyCore) +using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), framework::dataset::make("DataType", - DataType::S8)), - shape_a, shape_b, shape_c, a_offset, b_offset, c_offset, c_mult_int, out_shift, data_type) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, 
framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), + shape_a, shape_b, shape_c, a_offset, b_offset) { // Create tensors - Tensor a = create_tensor(shape_a, data_type); - Tensor b = create_tensor(shape_b, data_type); - Tensor c = create_tensor(shape_c, data_type); + Tensor a = create_tensor(shape_a, DataType::QASYMM8); + Tensor b = create_tensor(shape_b, DataType::QASYMM8); + Tensor c = create_tensor(shape_c, DataType::S32); + + a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); + b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - NEGEMMLowp gemmlowp; - gemmlowp.configure(&a, &b, &c, a_offset, b_offset, c_offset, c_mult_int, out_shift); + NEGEMMLowpMatrixMultiplyCore gemmlowp_mm; + gemmlowp_mm.configure(&a, &b, &c); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset()) +{ + // Validate output + validate(Accessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpOffsetFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpDataset(), framework::dataset::make("DataType", DataType::S8))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset()) { // Validate output validate(Accessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpOffsetFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpDataset(), framework::dataset::make("DataType", DataType::S8))) +TEST_SUITE_END() // MatrixMultiplyCore + +TEST_SUITE(OutputStage) + +TEST_SUITE(QuantizeDownInt32ToUint8Scale) + +using NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; + +const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -4, 4) * framework::dataset::make("result_mult_int", 1, 3) * framework::dataset::make("result_shift", 2, + 4); + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases), + shape, result_offset, result_mult_int, result_shift) +{ + // Create tensors + Tensor in = create_tensor(shape, DataType::S32); + Tensor out = create_tensor(shape, DataType::QASYMM8); + + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage; + output_stage.configure(&in, &out, result_offset, result_mult_int, result_shift); + + // Validate valid region + const ValidRegion valid_region = shape_to_valid_region(shape); + validate(in.info()->valid_region(), valid_region); + validate(out.info()->valid_region(), valid_region); + + // Validate padding + const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); + validate(in.info()->padding(), padding); + validate(out.info()->padding(), padding); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), 
quantize_down_int32_to_uint8_scale_cases)) { // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() // U8 -TEST_SUITE(S32) -using NEGEMMLowpMatrixMultiplyFixture = GEMMLowpMatrixMultiplyValidationFixture; -FIXTURE_DATA_TEST_CASE(MatrixMultiply, NEGEMMLowpMatrixMultiplyFixture, framework::DatasetMode::PRECOMMIT, data_matrix_multiply) +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_cases)) { // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale + +TEST_SUITE_END() // OutputStage + +TEST_SUITE_END() // GEMMLowp +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index fba44008ba..f9b0dbd959 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -43,36 +43,39 @@ namespace test namespace validation { template -class GEMMLowpOffsetValidationFixture : public framework::Fixture +class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture { public: template - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift, DataType data_type) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, int32_t a_offset, int32_t b_offset) { - _target = compute_target(shape_a, shape_b, shape_c, a_offset, b_offset, c_offset, c_mult_int, out_shift, data_type); - _reference = compute_reference(shape_a, shape_b, shape_c, a_offset, b_offset, c_offset, c_mult_int, out_shift, data_type); + _target = compute_target(shape_a, shape_b, shape_c, a_offset, b_offset); + _reference = compute_reference(shape_a, shape_b, shape_c, a_offset, b_offset); } protected: template void fill(U &&tensor, int i) { - ARM_COMPUTE_ERROR_ON(tensor.data_type() != DataType::S8); - std::uniform_int_distribution<> distribution(0, 3); + // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path + std::uniform_int_distribution<> distribution(1, 254); library->fill(tensor, distribution, i); } TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift, DataType data_type) + int32_t a_offset, int32_t b_offset) { // Create tensors - TensorType a = create_tensor(shape_a, data_type, 1); - TensorType b = create_tensor(shape_b, data_type, 1); - TensorType c = create_tensor(shape_c, data_type, 1); + TensorType a = create_tensor(shape_a, DataType::QASYMM8, 1); + TensorType b = create_tensor(shape_b, DataType::QASYMM8, 1); + TensorType c = create_tensor(shape_c, DataType::S32, 1); + + a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); + b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); // Create and configure function FunctionType gemmlowp; - gemmlowp.configure(&a, &b, &c, a_offset, b_offset, c_offset, c_mult_int, out_shift); + gemmlowp.configure(&a, &b, &c); ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); @@ -90,108 +93,91 @@ protected: 
// Fill tensors fill(AccessorType(a), 0); fill(AccessorType(b), 1); - fill(AccessorType(c), 2); // Compute GEMM function gemmlowp.run(); return c; } - SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift, DataType data_type) + SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, + int32_t a_offset, int32_t b_offset) { // Create reference - SimpleTensor a{ shape_a, data_type, 1 }; - SimpleTensor b{ shape_b, data_type, 1 }; - SimpleTensor c{ shape_c, data_type, 1 }; + SimpleTensor a{ shape_a, DataType::QASYMM8, 1 }; + SimpleTensor b{ shape_b, DataType::QASYMM8, 1 }; // Fill reference fill(a, 0); fill(b, 1); - fill(c, 2); - return reference::gemmlowp(a, b, c, a_offset, b_offset, c_offset, c_mult_int, out_shift); + return reference::gemmlowp_matrix_multiply_core(a, b, a_offset, b_offset); } - TensorType _target{}; - SimpleTensor _reference{}; + TensorType _target{}; + SimpleTensor _reference{}; }; template -class GEMMLowpMatrixMultiplyValidationFixture : public framework::Fixture +class GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture : public framework::Fixture { public: template - void setup(size_t m, size_t n, size_t k) + void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { - const TensorShape shape_a(k, m); - const TensorShape shape_b(n, k); - const TensorShape shape_c(n, m); - _target = compute_target(shape_a, shape_b, shape_c); - _reference = compute_reference(shape_a, shape_b, shape_c); + _target = compute_target(shape, result_offset, result_mult_int, result_shift); + _reference = compute_reference(shape, result_offset, result_mult_int, result_shift); } protected: template - void fill(U &&tensor, int i, int lo, int hi) + void fill(U &&tensor, int i) { - std::uniform_int_distribution<> distribution(lo, hi); + std::uniform_int_distribution<> distribution(-6000, 6000); library->fill(tensor, distribution, i); } - TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c) + TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { // Create tensors - TensorType a = create_tensor(shape_a, DataType::S8, 1); - TensorType b = create_tensor(shape_b, DataType::S8, 1); - TensorType c = create_tensor(shape_c, DataType::S32, 1); + TensorType a = create_tensor(shape, DataType::S32, 1); + TensorType b = create_tensor(shape, DataType::QASYMM8, 1); // Create and configure function - FunctionType gemmlowp; - gemmlowp.configure(&a, &b, &c); + FunctionType output_stage; + output_stage.configure(&a, &b, result_offset, result_mult_int, result_shift); ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); // Allocate tensors a.allocator()->allocate(); b.allocator()->allocate(); - c.allocator()->allocate(); ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS); // Fill tensors - fill(AccessorType(a), 0, -128, 127); - fill(AccessorType(b), 1, -128, 127); - fill(AccessorType(c), 2, 0, 0); + 
fill(AccessorType(a), 0); // Compute GEMM function - gemmlowp.run(); - return c; + output_stage.run(); + return b; } - SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c) + SimpleTensor compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { // Create reference - SimpleTensor a{ shape_a, DataType::S8, 1 }; - SimpleTensor b{ shape_b, DataType::S8, 1 }; - SimpleTensor c{ shape_c, DataType::S32, 1 }; + SimpleTensor a{ shape, DataType::S32, 1 }; // Fill reference - fill(a, 0, -128, 127); - fill(b, 1, -128, 127); - fill(c, 2, 0, 0); + fill(a, 0); - return reference::gemmlowp(a, b, c); + return reference::gemmlowp_quantize_down_int32_to_uint8_scale(a, result_offset, result_mult_int, result_shift); } TensorType _target{}; - SimpleTensor _reference{}; + SimpleTensor _reference{}; }; - } // namespace validation } // namespace test } // namespace arm_compute -- cgit v1.2.1
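
Reviewer note (appended after the patch trailer, not part of the diff): the new reference gemmlowp_matrix_multiply_core keeps its accumulators in int32 and folds the quantization offsets into each operand before multiplying. The standalone sketch below restates that inner loop outside the test framework; the function and variable names (lowp_mm_core, va, vb) are illustrative only, and it assumes row-major uint8_t inputs of size MxK and KxN.

#include <cstdint>
#include <vector>

// C (MxN) accumulators stay in int32; the offsets are added to the raw
// quantized values before multiplication, mirroring the reference added in
// tests/validation/CPP/GEMMLowp.cpp.
std::vector<int32_t> lowp_mm_core(const std::vector<uint8_t> &a, // M x K, row-major
                                  const std::vector<uint8_t> &b, // K x N, row-major
                                  int M, int N, int K,
                                  int32_t a_offset, int32_t b_offset)
{
    std::vector<int32_t> c(static_cast<size_t>(M) * N, 0);
    for(int i = 0; i < M; ++i)
    {
        for(int k = 0; k < K; ++k)
        {
            const int32_t va = a_offset + static_cast<int32_t>(a[i * K + k]);
            for(int j = 0; j < N; ++j)
            {
                const int32_t vb = b_offset + static_cast<int32_t>(b[k * N + j]);
                c[i * N + j] += va * vb;
            }
        }
    }
    return c;
}

Keeping the result in S32 is what lets the MatrixMultiplyCore and OutputStage test suites above exercise the two runtime functions independently.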
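
Reviewer note (appended after the patch trailer, not part of the diff): the requantization performed by the QuantizeDownInt32ToUint8Scale output stage reduces to one per-element formula, ((acc + result_offset) * result_mult_int) >> result_shift, clamped to [0, 255]. The helper below is a minimal sketch of that formula only; the name quantize_down_scalar is made up for illustration and this is not the library implementation.

#include <algorithm>
#include <cstdint>

// Requantize one int32 accumulator to uint8, mirroring the reference
// gemmlowp_quantize_down_int32_to_uint8_scale added in this patch.
uint8_t quantize_down_scalar(int32_t acc, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
{
    const int32_t scaled = ((acc + result_offset) * result_mult_int) >> result_shift;
    return static_cast<uint8_t>(std::max(0, std::min(255, scaled))); // clamp to the uint8 range
}

For example, acc = 1000 with result_offset = -4, result_mult_int = 2 and result_shift = 2 gives ((1000 - 4) * 2) >> 2 = 498, which clamps to 255.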
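
Reviewer note (appended after the patch trailer, not part of the diff): for integrators, the two new runtime functions chain as in the sketch below. It only relies on calls that appear in this patch plus standard Tensor/TensorInfo setup; the concrete shapes, offsets and requantization parameters are placeholders borrowed from SmallGEMMLowpDataset, not requirements.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // A: 13x21 (K = 21), B: 21x33 (N = 33), accumulators and output: 13x33.
    Tensor a, b, acc, dst;
    a.allocator()->init(TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8));
    b.allocator()->init(TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8));
    acc.allocator()->init(TensorInfo(TensorShape(33U, 13U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(33U, 13U), 1, DataType::QASYMM8));

    // The quantization info carries the offsets consumed by the core multiply.
    a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, -2));
    b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, 0));

    // Stage 1: QASYMM8 x QASYMM8 -> S32 accumulators.
    NEGEMMLowpMatrixMultiplyCore mm_core;
    mm_core.configure(&a, &b, &acc);

    // Stage 2: requantize S32 -> QASYMM8 with offset, multiplier and shift.
    NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
    output_stage.configure(&acc, &dst, /*result_offset=*/1, /*result_mult_int=*/1, /*result_shift=*/1);

    a.allocator()->allocate();
    b.allocator()->allocate();
    acc.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized data here ...

    mm_core.run();
    output_stage.run();
    return 0;
}

Splitting the core multiply from the output stage also leaves the S32 accumulators available for other post-processing (for example a bias addition) before the final requantization.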