From dbdea0d1c025b18d4d82c278c87454427918f5b4 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 16 Oct 2019 19:21:40 +0100 Subject: COMPMID-2308: NEConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters Change-Id: Ic1bf5f0d21ccd525f84213a360f7e199d7f50577 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/2177 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- arm_compute/core/NEON/NEAsymm.h | 99 +++++ arm_compute/core/NEON/NEKernels.h | 1 + arm_compute/core/NEON/NEMath.h | 11 + arm_compute/core/NEON/NEMath.inl | 8 + .../kernels/NEConvertQuantizedSignednessKernel.h | 76 ++++ .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 8 +- .../NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 10 +- .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 29 +- .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 6 +- .../core/NEON/kernels/NEWeightsReshapeKernel.h | 4 +- arm_compute/core/NEON/wrapper/intrinsics/add.h | 82 +++- arm_compute/core/NEON/wrapper/intrinsics/eor.h | 56 +++ .../core/NEON/wrapper/intrinsics/intrinsics.h | 2 + .../core/NEON/wrapper/intrinsics/reinterpret.h | 43 ++ arm_compute/core/NEON/wrapper/traits.h | 16 + arm_compute/core/QuantizationInfo.h | 30 ++ arm_compute/core/Types.h | 13 +- arm_compute/core/Utils.h | 5 + arm_compute/core/utils/quantization/AsymmHelpers.h | 15 + .../NEON/functions/NEGEMMConvolutionLayer.h | 14 +- .../runtime/NEON/functions/NEGEMMInterleave4x4.h | 4 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 51 ++- .../runtime/NEON/functions/NEGEMMTranspose1xW.h | 6 +- .../kernels/NEConvertQuantizedSignednessKernel.cpp | 136 +++++++ .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 4 +- .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 5 +- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 438 +++++++++++++++++---- .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 275 +++++++------ src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 2 +- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 2 +- src/core/Utils.cpp | 4 + src/core/utils/quantization/AsymmHelpers.cpp | 38 ++ .../NEON/functions/NEGEMMAssemblyDispatch.cpp | 16 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 88 ++--- src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp | 7 +- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 148 +++++-- tests/AssetsLibrary.h | 1 + tests/Utils.h | 1 + tests/validation/Helpers.cpp | 18 + tests/validation/Helpers.h | 18 + tests/validation/NEON/ConvolutionLayer.cpp | 37 ++ .../validation/fixtures/ConvolutionLayerFixture.h | 77 +++- tests/validation/reference/Convolution3d.h | 51 ++- tests/validation/reference/ConvolutionLayer.cpp | 20 +- tests/validation/reference/ConvolutionLayer.h | 4 +- 45 files changed, 1592 insertions(+), 387 deletions(-) create mode 100644 arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/eor.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h create mode 100644 src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h index 56d4c09f92..a3bd7e28f0 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/arm_compute/core/NEON/NEAsymm.h @@ -115,6 +115,66 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32, return out_u8; } +/** Performs final quantization step on 16 elements for symmetric quantization + * + * @tparam is_bounded_relu Specified if a fused bounded relu should be applied + * + * @param 
in_s32 Input to be quantized. + * @param result_fixedpoint_multiplier Result multiplier parameter + * @param result_shift Result shift parameter + * @param result_offset_after_shift_s32 Result offset parameter + * @param min_s8 Relu lower bound + * @param max_s8 Relu upper bound + * + * @return Quantized values + */ +template +inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, + const int32x4x4_t &result_fixedpoint_multiplier, + const int32x4x4_t &result_shift, + const int32x4_t &result_offset_after_shift_s32, + const int8x16_t &min_s8, + const int8x16_t &max_s8) +{ + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + in_s32.val[0] = vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]); + in_s32.val[1] = vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]); + in_s32.val[2] = vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]); + in_s32.val[3] = vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]); + + // Round to the nearest division by a power-of-two using result_shift_s32 + in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift.val[0]); + in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift.val[1]); + in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift.val[2]); + in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift.val[3]); + + // Add the offset terms + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + /** Performs final quantization step on single element * * @tparam is_bounded_relu Specified if a fused bounded relu should be applied @@ -154,6 +214,45 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul return out_u8; } +/** Performs final quantization step on single element + * + * @tparam is_bounded_relu Specified if a fused bounded relu should be applied + * + * @param[in] in_value Input to be quantized. 
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * + * @return Quantized value + */ +template +inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, + int32_t result_shift, int32_t result_offset_after_shift_s32, + int8_t min_s8, int8_t max_s8) +{ + int32x4_t in_s32 = vdupq_n_s32(in_value); + + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); + + // Shift value by result_shift_s32 + in_value = rounding_divide_by_pow2(in_value, result_shift); + + // Add the offset term + in_value += result_offset_after_shift_s32; + + // Bound the result + int8_t out_s8 = static_cast(std::max(-128, std::min(127, in_value))); + if(is_bounded_relu) + { + out_s8 = static_cast(std::max(min_s8, std::min(max_s8, out_s8))); + } + + return out_s8; +} + /** Dequantize a neon vector holding 8 quantized values. * * @param[in] qv Input values to be dequantized. diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 33a640fa05..aa46a346e9 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -46,6 +46,7 @@ #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" #include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" #include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" +#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" #include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" #include "arm_compute/core/NEON/kernels/NECopyKernel.h" #include "arm_compute/core/NEON/kernels/NECropKernel.h" diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h index 560abd6cdc..8593059b1a 100644 --- a/arm_compute/core/NEON/NEMath.h +++ b/arm_compute/core/NEON/NEMath.h @@ -124,6 +124,17 @@ float32x4_t vtanhq_f32(float32x4_t val); */ float32x4_t vpowq_f32(float32x4_t val, float32x4_t n); +/** Round to the nearest division by a power-of-two using exponent + * + * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent + * + * @param[in] x Vector of 4 elements + * @param[in] exponent Vector of 4 elements with integer value used to round to nearest division by a power-of-two + * + * @return the nearest division by a power-of-two using exponent + */ +int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent); + /** Round to the nearest division by a power-of-two using exponent * * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index 61315e8dbc..f1c9c2024b 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -294,6 +294,14 @@ inline float32x2_t vsin_f32(float32x2_t val) #endif /* DOXYGEN_SKIP_THIS */ +inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent) +{ + const int32x4_t shift_vec = vnegq_s32(exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); + const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed_up_x, shift_vec); +} + inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) { const int32x4_t shift_vec = 
vdupq_n_s32(-exponent); diff --git a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h new file mode 100644 index 0000000000..d3f0907273 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__ +#define __ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** NEON kernel to convert asymmetric signed to asymmetric signed and vice-versa */ +class NEConvertQuantizedSignednessKernel : public INEKernel +{ +public: + const char *name() const override + { + return "NEConvertQuantizedSignednessKernel"; + } + /** Default constructor */ + NEConvertQuantizedSignednessKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete; + /** Allow instances of this class to be moved */ + NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default; + /** Allow instances of this class to be moved */ + NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. Data types supported: opposite of @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NECopyKernel + * + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] output Destination tensor. Data types supported: opposite of @p input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index 5c0104d138..f470126844 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -60,13 +60,13 @@ public: NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: All * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * * @return a status @@ -79,7 +79,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: All * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h index 354ae21e29..6467a8dd08 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -62,15 +62,15 @@ public: * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two * kernels change the layout of the original matrices to be more cache-friendly. * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0 + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED + * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type supported: S32 */ void configure(const ITensor *input0, const ITensor *input1, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel * - * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8 - * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0 + * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED + * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32 * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h index 4eab86d00a..ce3dddbe19 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -47,7 +47,7 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data type supported: QASYMM8 + * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 * @param[in] k Number of matrix A columns (or matrix B rows) * @param[in] is_reshaped True if the input tensor has been reshaped @@ -75,7 +75,7 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8 + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 * @param[in] num_mtx_a_cols Number of matrix A columns * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 @@ -83,7 +83,7 @@ public: void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override; /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8 + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 * @param[in] num_mtx_a_cols Number of matrix A columns * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 @@ -94,6 +94,14 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] window Execution window + */ + template + void run_internal(const Window &window); }; /** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. @@ -110,7 +118,7 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] mtx_b Input tensor. 
Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 * @param[in] num_mtx_b_rows Number of matrix B rows * @param[in] is_transposed1xW True if the input tensor is transposed 1xW @@ -118,7 +126,7 @@ public: void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override; /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 * @param[in] num_mtx_b_rows Number of matrix B rows * @param[in] is_transposed1xW True if the input tensor is transposed 1xW @@ -129,6 +137,15 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] window Execution window + * @param[in] info Thread-related information + */ + template + void run_internal(const Window &window, const ThreadInfo &info); }; } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h index b7fbfcfcd2..54086d1c24 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -74,13 +74,13 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info. Data type supported: same as @p input. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h index bba18a8fa8..585c707bb6 100644 --- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h @@ -75,7 +75,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/FP16/F32 * @param[in] bias The shared biases tensor to append. 
Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. @@ -85,7 +85,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h index 4f4d244489..1839170485 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/add.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/add.h @@ -63,13 +63,13 @@ VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VADD_IMPL +// VQADD: Vector saturating add (No notion of saturation for floating point) #define VQADD_IMPL(stype, vtype, prefix, postfix) \ inline vtype vqadd(const vtype &a, const vtype &b) \ { \ return prefix##_##postfix(a, b); \ } -// VQADD: Vector saturating add (No notion of saturation for floating point) VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8) VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8) VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16) @@ -96,6 +96,86 @@ VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32) VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VQADD_IMPL + +// VADDW: Vector widening add +#define VADDW_IMPL(wtype, vtype, prefix, postfix) \ + inline wtype vaddw(const wtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8) +VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8) +VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16) +VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16) +VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32) +VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32) +#undef VADDW_IMPL + +// VADDL: Vector long add +#define VADDL_IMPL(wtype, vtype, prefix, postfix) \ + inline wtype vaddl(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8) +VADDL_IMPL(int16x8_t, int8x8_t, vaddl, s8) +VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16) +VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16) +VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32) +VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32) +#undef VADDL_IMPL + +#if defined(__aarch64__) +// VADDV: Across vector add +#define VADDV_IMPL(stype, vtype, prefix, postfix) \ + inline stype vaddv(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8) +VADDV_IMPL(int8_t, int8x8_t, vaddv, s8) +VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16) +VADDV_IMPL(int16_t, int16x4_t, vaddv, s16) +VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32) +VADDV_IMPL(int32_t, int32x2_t, vaddv, s32) 
+VADDV_IMPL(float, float32x2_t, vaddv, f32) + +VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8) +VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8) +VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16) +VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16) +VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32) +VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32) +VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64) +VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64) +VADDV_IMPL(float, float32x4_t, vaddvq, f32) +#undef VADDV_IMPL +#endif // defined(__aarch64__) + +// VPADDL: Signed add long pairwise +#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \ + inline ltype vpaddl(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8) +VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8) +VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16) +VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16) +VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32) +VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32) + +VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8) +VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8) +VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16) +VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16) +VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32) +VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32) +#undef VPADDL_IMPL } // namespace wrapper } // namespace arm_compute #endif /* __ARM_COMPUTE_WRAPPER_ADD_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/arm_compute/core/NEON/wrapper/intrinsics/eor.h new file mode 100644 index 0000000000..1002808327 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/eor.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_EOR_H__ +#define __ARM_COMPUTE_WRAPPER_EOR_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VEOR_IMPL(vtype, prefix, postfix) \ + inline vtype veor(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VEOR_IMPL(uint8x8_t, veor, u8) +VEOR_IMPL(int8x8_t, veor, s8) +VEOR_IMPL(uint16x4_t, veor, u16) +VEOR_IMPL(int16x4_t, veor, s16) +VEOR_IMPL(uint32x2_t, veor, u32) +VEOR_IMPL(int32x2_t, veor, s32) + +VEOR_IMPL(uint8x16_t, veorq, u8) +VEOR_IMPL(int8x16_t, veorq, s8) +VEOR_IMPL(uint16x8_t, veorq, u16) +VEOR_IMPL(int16x8_t, veorq, s16) +VEOR_IMPL(uint32x4_t, veorq, u32) +VEOR_IMPL(int32x4_t, veorq, s32) + +#undef VEOR_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_EOR_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h index 0362ca125f..6eae1cf801 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h @@ -35,6 +35,7 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/combine.h" #include "arm_compute/core/NEON/wrapper/intrinsics/div.h" #include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/eor.h" #include "arm_compute/core/NEON/wrapper/intrinsics/exp.h" #include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h" #include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h" @@ -56,6 +57,7 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h" #include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h" #include "arm_compute/core/NEON/wrapper/intrinsics/pow.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h" #include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h" #include "arm_compute/core/NEON/wrapper/intrinsics/round.h" #include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h" diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h new file mode 100644 index 0000000000..0956959e20 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_REINTERPRET_H__ +#define __ARM_COMPUTE_WRAPPER_REINTERPRET_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +inline int32x4_t vreinterpret_s32(const uint32x4_t &val) +{ + return vreinterpretq_s32_u32(val); +} +inline int32x4_t vreinterpret_s32(const int32x4_t &val) +{ + return val; +} +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_REINTERPRET_H__ */ diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h index cc22597c29..d700aacaad 100644 --- a/arm_compute/core/NEON/wrapper/traits.h +++ b/arm_compute/core/NEON/wrapper/traits.h @@ -112,6 +112,22 @@ template <> struct neon_bitvector{ using type = float template using neon_bitvector_t = typename neon_bitvector::type; /** Helper type template to get the tag type of a neon vector */ template using neon_bitvector_tag_t = typename neon_bitvector::tag_type; + +/** Promote a type */ +template struct promote { }; +template <> struct promote { using type = uint16_t; }; +template <> struct promote { using type = int16_t; }; +template <> struct promote { using type = uint32_t; }; +template <> struct promote { using type = int32_t; }; +template <> struct promote { using type = uint64_t; }; +template <> struct promote { using type = int64_t; }; +template <> struct promote { using type = float; }; +template <> struct promote { using type = half; }; + +/** Get promoted type */ +template +using promote_t = typename promote::type; + // clang-format on // *INDENT-ON* } // namespace traits diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h index 5e6e5b349c..949ee66b7c 100644 --- a/arm_compute/core/QuantizationInfo.h +++ b/arm_compute/core/QuantizationInfo.h @@ -250,6 +250,36 @@ inline int8_t quantize_qsymm8(float value, const QuantizationInfo &qinfo) return quantized; } +/** Quantize a value given a 8-bit symmetric per channel quantization scheme + * + * @param[in] value Value to quantize + * @param[in] qinfo Quantization information to use for quantizing + * @param[in] channel_id channel index into the scale vector of quantization info + * + * @return Quantized value + */ +inline int8_t quantize_qsymm8_per_channel(float value, const QuantizationInfo &qinfo, size_t channel_id = 0) +{ + int quantized = arm_compute::round(value / qinfo.scale()[channel_id], RoundingPolicy::TO_NEAREST_UP); + quantized = std::max(-128, std::min(quantized, 127)); + return quantized; +} + +/** Quantize a value given a 8-bit asymmetric per channel quantization scheme + * + * @param[in] value Value to quantize + * @param[in] qinfo Quantization information to use for quantizing + * @param[in] channel_id channel index into the scale vector of quantization info + * + * @return Quantized value + */ +inline int8_t quantize_qasymm8_per_channel(float value, const QuantizationInfo &qinfo, size_t channel_id = 0) +{ + int quantized = arm_compute::round(value / qinfo.scale()[channel_id], RoundingPolicy::TO_NEAREST_UP); + quantized = std::max(0, std::min(quantized, 255)); + return quantized; +} + /** Dequantize a value given a 8-bit asymmetric quantization scheme * * @param[in] value Value to dequantize diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index f4955ed457..1c9e8ce891 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -77,7 +77,8 @@ enum class DataType U8, /**< unsigned 8-bit number */ S8, /**< signed 8-bit number */ QSYMM8, /**< quantized, symmetric fixed-point 8-bit number */ - 
QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number */ + QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number unsigned */ + QASYMM8_SIGNED, /**< quantized, asymmetric fixed-point 8-bit number signed */ QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */ QASYMM8_PER_CHANNEL, /**< quantized, asymmetric per channel fixed-point 8-bit number */ U16, /**< unsigned 16-bit number */ @@ -1881,6 +1882,8 @@ struct GEMMLowpOutputStageInfo int gemmlowp_shift{ 0 }; /**< GEMMLowp output stage shift used for quantizing to uint8 */ int gemmlowp_min_bound{ 0 }; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */ int gemmlowp_max_bound{ 0 }; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */ + std::vector gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ + std::vector gemmlowp_shifts{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ }; /** GEMM LHS (Left Hand Side) matrix information */ @@ -2015,6 +2018,14 @@ public: { return _gemmlowp_output_stage; }; + /** Sets GEMMLowp output stage + * + * @param[in] output_stage Output stage to set + */ + void set_gemmlowp_output_stage(GEMMLowpOutputStageInfo &output_stage) + { + _gemmlowp_output_stage = output_stage; + }; /** Flag which specifies if a wider accumulator should be used. * * @return True if a wider accumulator has to be used diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 3939491bb2..a6e1ea1a89 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -114,6 +114,7 @@ inline size_t data_size_from_type(DataType data_type) case DataType::S8: case DataType::QSYMM8: case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: case DataType::QASYMM8_PER_CHANNEL: return 1; @@ -191,6 +192,7 @@ inline size_t element_size_from_data_type(DataType dt) case DataType::U8: case DataType::QSYMM8: case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: return 1; case DataType::U16: @@ -533,6 +535,7 @@ inline DataType get_promoted_data_type(DataType dt) return DataType::S32; case DataType::QSYMM8: case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: case DataType::QASYMM8_PER_CHANNEL: case DataType::QSYMM16: @@ -1024,6 +1027,7 @@ inline bool is_data_type_quantized(DataType dt) { case DataType::QSYMM8: case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: case DataType::QASYMM8_PER_CHANNEL: case DataType::QSYMM16: @@ -1045,6 +1049,7 @@ inline bool is_data_type_quantized_asymmetric(DataType dt) switch(dt) { case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: case DataType::QASYMM8_PER_CHANNEL: case DataType::QASYMM16: return true; diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h index 8ec4a331f6..6b6cb007e3 100644 --- a/arm_compute/core/utils/quantization/AsymmHelpers.h +++ b/arm_compute/core/utils/quantization/AsymmHelpers.h @@ -59,6 +59,21 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int *quant * @return a status */ Status calculate_quantized_multiplier_greater_than_one(float multiplier, int *quantized_multiplier, int *left_shift); + +/** Calculate quantized representation of per-channel multipliers with value less than one. + * + * @param[in] iq_info Input quantization info. 
+ * @param[in] wq_info Weights quantization info. + * @param[in] oq_info Output quantization info. + * @param[in, out] stage_info GemmLowp output stage info + * + * @return a status + */ +Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info, + const QuantizationInfo &wq_info, + const QuantizationInfo &oq_info, + GEMMLowpOutputStageInfo &stage_info); + /** Get minimum and maximum values for the input quantized data type * * @return min and max values for the quantized data type diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 3e551abf5a..2d4aaa495f 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -63,14 +63,14 @@ public: NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = default; /** Set the input and output tensors. * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] output Destination tensor. Data types supported: Same as @p weights. */ void configure(const ITensor *weights, const ITensor *biases, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[in] output Destination tensor. Data types supported: Same as @p weights. * @@ -158,8 +158,8 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * Data types supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. @@ -178,7 +178,7 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. @@ -203,7 +203,7 @@ private: /** Configures the appropriate matrix multiply routine * * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[out] output Output tensor. Data types supported: Same as @p input, @@ -215,7 +215,7 @@ private: /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines * * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[in] output Output tensor. Data types supported: Same as @p input, diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h index 4d7f67b949..ec56d752b5 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,7 +40,7 @@ class NEGEMMInterleave4x4 : public INESimpleFunctionNoBorder public: /** Initialise the kernel's inputs, output * - * @param[in] input First input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input First input tensor. Data types supported: All * @param[out] output Output tensor. 
Data type supported: same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index 12c120934e..aa2c23c97c 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -26,6 +26,8 @@ #include "NEActivationLayer.h" #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" @@ -76,24 +78,24 @@ public: * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. * -# Compute the matrix product of the resulting a * b in int32. * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8 otherwise + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise * - * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8 + * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8 otherwise + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise * - * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. + * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[in] output Output tensor info. Data type supported: Data type supported: S32/QASYMM8 + * @param[in] output Output tensor info. 
Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run * @@ -116,21 +118,28 @@ private: NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; NEActivationLayer _activation_func; - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _mm_result_s32; - const ITensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - bool _run_vector_matrix_multiplication; - bool _assembly_path; - bool _fused_assembly_path; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _fuse_output_stage; - bool _run_activation; + NEConvertQuantizedSignednessKernel _convert_to_signed_asymm; + NEConvertQuantizedSignednessKernel _convert_from_signed_asymm; + + Tensor _vector_sum_col; + Tensor _vector_sum_row; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _mm_result_s32; + Tensor _signed_a; + Tensor _signed_output; + const ITensor *_original_b; + int32_t _a_offset; + int32_t _b_offset; + + bool _run_vector_matrix_multiplication; + bool _assembly_path; + bool _fused_assembly_path; + bool _reshape_b_only_on_first_run; + bool _is_prepared; + bool _fuse_output_stage; + bool _run_activation; + bool _flip_signedness; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h index b44c5a3ee3..f5ba08bdd1 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,13 +40,13 @@ class NEGEMMTranspose1xW : public INESimpleFunctionNoBorder public: /** Initialise the kernel's inputs, output * - * @param[in] input First input tensor. Data type supported: U8/S8/U16/S16/F16/U32/S32/F32/ + * @param[in] input First input tensor. Data type supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xW * - * @param[in] input First input tensor. Data type supported: U8/S8/U16/S16/F16/U32/S32/F32/ + * @param[in] input First input tensor. Data type supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor. Data type supported: same as @p input * * @return a status diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp new file mode 100644 index 0000000000..39e030e434 --- /dev/null +++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + + // Validate output if initialized + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Output auto inizialitation if not yet initialized + { + const bool is_input_signed = input->data_type() == DataType::QASYMM8_SIGNED; + const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo qinfo = input->quantization_info().uniform(); + const int offset_correction = is_input_signed ? 
-128 : 128; + const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); + + auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); + } + + return std::make_pair(Status{}, calculate_max_window(*output)); +} +} // namespace + +NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + std::pair win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); + return Status{}; +} + +void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + const uint8_t mask = 128; + const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index c929983162..a9c04824ae 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -45,9 +45,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, - DataType::U16, DataType::S16, DataType::U32, DataType::S32, - DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); if(output->total_size() != 0) diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 6cec51d5a2..8f5a208cbb 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -722,8 +722,8 @@ namespace { Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); TensorShape in0_shape = input0->tensor_shape(); @@ -917,6 +917,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo switch(_input0->info()->data_type()) { case DataType::S8: + case DataType::QASYMM8_SIGNED: { matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); break; diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index 46e53cec12..3ada3a3c4f 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -72,6 +72,58 @@ inline int32x4x4_t load(const int32_t *ptr, int32_t x) }; } +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b), + vaddq_s32(a.val[1], b), + vaddq_s32(a.val[2], b), + vaddq_s32(a.val[3], b) + } + }; +} + +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b.val[0]), + vaddq_s32(a.val[1], b.val[1]), + vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3]) + } + }; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) +{ + return + { + { + vmulq_n_s32(a.val[0], mul_scalar), + vmulq_n_s32(a.val[1], mul_scalar), + vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar) + } + }; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) +{ + return + { + { + vmulq_s32(a.val[0], vld1q_s32(multilpier)), + vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), + vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) + } + }; +} + inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) { int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x); @@ -141,6 +193,82 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } +template +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +{ + const static int32x4_t zero_s32 = 
vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +template +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0])); + in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1])); + in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2])); + in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3])); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + inline Window get_win_vector_sum(const Window &window) { Window win_vector_sum(window); @@ -172,50 +300,12 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) return bias_it; } -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; -} - -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; -} - -inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) -{ - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; -} - template inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, int32_t a_offset, int32_t b_offset, int32_t k_offset, - GEMMLowpOutputStageInfo 
output_stage, int window_step_x, int window_start_x, int window_end_x) + int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x) { int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; if(!is_fixed_point) @@ -251,12 +341,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su } if(!is_fixed_point) { - in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier); + in_s32 = mul_s32(in_s32, multiplier); } if(is_fixed_point) { - vst1q_u8(out_it.ptr() + x, finalize_quantization(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8)); + vst1q_u8(out_it.ptr() + x, finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8)); } else { @@ -280,24 +370,99 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, - output_stage.gemmlowp_offset, static_cast(output_stage.gemmlowp_min_bound), static_cast(output_stage.gemmlowp_max_bound)); + *(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, static_cast(min_bound), static_cast(max_bound)); } else { // Finalize quantization - in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift; + in_value = (in_value * multiplier) >> shift; // Bound and store the result if(is_bounded_relu) { - in_value = static_cast(std::max(output_stage.gemmlowp_min_bound, std::min(output_stage.gemmlowp_max_bound, in_value))); + in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); } *(out_it.ptr() + x) = static_cast(std::max(0, std::min(255, in_value))); } } } +template +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, + const int32_t *result_multipliers, const int32_t *result_shifts, + const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, + int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x) +{ + int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; + if(!is_fixed_point) + { + // Combine quantization offset with other offsets. 
+ offset_term_s32 = add_s32(offset_term_s32, result_offset); + } + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = load_results_input(mm_result_it, x); + + if(has_a_offset) + { + in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); + } + if(has_bias) + { + in_s32 = add_s32(in_s32, load(bias_ptr, x)); + } + if(!is_fixed_point) + { + in_s32 = add_s32(in_s32, offset_term_s32); + in_s32 = mul_s32(in_s32, result_multipliers + x); + } + + if(is_fixed_point) + { + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8)); + } + else + { + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8)); + } + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + + if(has_a_offset) + { + in_value += (*(vector_sum_col_ptr + x) * a_offset); + } + if(has_bias) + { + in_value += *(bias_ptr + x); + } + + if(is_fixed_point) + { + // Finalize and store the result + *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound)); + } + else + { + // Finalize quantization + in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); + + // Bound and store the result + if(is_bounded_relu) + { + in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); + } + *(out_it.ptr() + x) = static_cast(std::max(-128, std::min(127, in_value))); + } + } +} + template void run_offset_contribution_output_stage(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, @@ -307,10 +472,16 @@ void run_offset_contribution_output_stage(const Window &window, const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; - const int32x4_t result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift); - const uint8x16_t min_u8 = vdupq_n_u8(static_cast(output_stage.gemmlowp_min_bound)); - const uint8x16_t max_u8 = vdupq_n_u8(static_cast(output_stage.gemmlowp_max_bound)); + const int32_t multiplier = output_stage.gemmlowp_multiplier; + const int32_t shift = output_stage.gemmlowp_shift; + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? 
shift : -shift); + const uint8x16_t min_u8 = vdupq_n_u8(static_cast(min_bound)); + const uint8x16_t max_u8 = vdupq_n_u8(static_cast(max_bound)); const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -349,7 +520,8 @@ void run_offset_contribution_output_stage(const Window &window, run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -363,7 +535,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } @@ -386,7 +559,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -399,7 +573,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, mm_result_it, out_it); } @@ -422,7 +597,8 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -434,7 +610,8 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, 
max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, mm_result_it, out_it); } @@ -448,7 +625,8 @@ void run_offset_contribution_output_stage(const Window &window, { run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, bias_it, mm_result_it, out_it); } @@ -458,7 +636,110 @@ void run_offset_contribution_output_stage(const Window &window, { run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + mm_result_it, out_it); + } + return; + } +} + +template +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, + GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); + + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int8x16_t min_s8 = vdupq_n_s8(static_cast(min_bound)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast(max_bound)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); + + Iterator mm_result_it(mm_result, win); + Iterator out_it(output, win); + + if(a_offset != 0) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; + + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + vector_sum_col_it, mm_result_it, out_it); + } + } + else + { + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, mm_result_it, out_it); } @@ -470,8 +751,18 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + if(output->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 127); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < -128); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); + } + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); if(bias != nullptr) @@ -525,7 +816,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto if(output->total_size() != 0) { - 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); } @@ -551,7 +842,7 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, } NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction -get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage) +get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage) { static std::map map_function = { @@ -562,7 +853,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, { 4, &run_offset_contribution_output_stage }, { 5, &run_offset_contribution_output_stage }, { 6, &run_offset_contribution_output_stage }, - { 7, &run_offset_contribution_output_stage } + { 7, &run_offset_contribution_output_stage_symm }, + { 8, &run_offset_contribution_output_stage_symm }, + { 9, &run_offset_contribution_output_stage_symm }, + { 10, &run_offset_contribution_output_stage_symm }, + { 11, &run_offset_contribution_output_stage_symm }, + { 12, &run_offset_contribution_output_stage_symm }, + { 13, &run_offset_contribution_output_stage_symm }, + { 14, &run_offset_contribution_output_stage_symm }, + { 15, &run_offset_contribution_output_stage_symm } }; // Check if input is a 3D reinterpretation @@ -574,11 +873,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound) && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255)); + // Check if we need to perform fixed point requantization const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; + // Check if symmetric per-channel execution + const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED; + // key acts as a bitset, setting the first bit on reinterpret_as_3d, // the second on is_bounded_relu, and the third on is_fixed_point. - uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2); + uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 
1UL : 0UL) << 3); return map_function.find(key)->second; } } // namespace @@ -591,8 +894,9 @@ NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutpu } void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col, - const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) + const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t k, int32_t a_offset, int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); @@ -627,7 +931,7 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - _function = get_configured_function(mm_result, vector_sum_row, output_stage); + _function = get_configured_function(mm_result, vector_sum_row, output, output_stage); } Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index c1ee770db5..72632492d7 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include #include #include @@ -48,7 +48,7 @@ namespace { Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); return Status{}; @@ -72,7 +72,7 @@ std::pair validate_and_configure_window_matrix_a_reduction(ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); return Status{}; @@ -128,11 +128,12 @@ Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, cons return Status{}; } -void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) +template +void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t; + using TAcc = wrapper::traits::promote_t; Window collapsed_window = 
window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -149,9 +150,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4_t sum_row = vdupq_n_u32(0); + auto sum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const T *matrix_a = reinterpret_cast((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -161,43 +162,41 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4); + const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4); - // Convert U8 to U16 - uint16x4x4_t a0_u16 = + // Convert 8-bit to 16-bit + typename wrapper::traits::neon_bitvector::type a0_d16[4] = { - { - vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(a0_u8))) - } + wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))), + wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))), + wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))), + wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8))) }; - // Accumulate to U16 - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]); - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]); - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]); + // Accumulate to 16-bit + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]); + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]); + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]); - // Accumulate to U32 - sum_row = vaddw_u16(sum_row, a0_u16.val[0]); + // Accumulate to 32-bit + sum_row = wrapper::vaddw(sum_row, a0_d16[0]); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4); + const auto a0_d8 = wrapper::vload(matrix_a + i * 4); // Convert U8 to U16 - const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8)); + const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8)); // Accumulate to U32 - sum_row = vaddw_u16(sum_row, a0_u16); + sum_row = wrapper::vaddw(sum_row, a0_d16); } auto vector_sum_row = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row)); + wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row)); }, in, out); } @@ -206,10 +205,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4_t sum_row_u32 = vdupq_n_u32(0); - uint32_t sum_row = 0; + auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; - const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * 
_input->info()->strides_in_bytes()[2])); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -219,37 +218,57 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 16 accumulations for(; i <= (_k - 16); i += 16) { - const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i); + const auto a0_d8 = wrapper::vloadq(matrix_a + i); // Partial accumulations in U16 - const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8)); + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); // Accumulate to U32 - sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0)); + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - sum_row += static_cast(matrix_a[i]); + sum_row += static_cast(matrix_a[i]); } #if defined(__aarch64__) // Reduction operation available on 64 bit architectures only - sum_row += vaddvq_u32(sum_row_u32); + sum_row += wrapper::vaddv(vsum_row); #else // __aarch64__ - uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32)); - tmp = vpadd_u32(tmp, tmp); + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); - sum_row += vget_lane_u32(tmp, 0); + sum_row += wrapper::vgetlane(tmp, 0); #endif // __aarch64__ - *(reinterpret_cast(out.ptr())) = static_cast(sum_row); + *(reinterpret_cast(out.ptr())) = static_cast(sum_row); }, in, out); } } +void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_input->info()->data_type()) + { + case DataType::QASYMM8: + run_internal(window); + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + run_internal(window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } +} + void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); @@ -276,11 +295,12 @@ Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, cons return Status{}; } -void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) +template +void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t; + using TAcc = wrapper::traits::promote_t; Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -297,17 +317,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4x4_t sum_col = + typename wrapper::traits::neon_bitvector::type sum_col[4] = { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), 
wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]; + const auto *matrix_b = reinterpret_cast(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -316,35 +334,28 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf int i = 0; for(; i < _k; ++i) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16); + const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16); - // Convert S8 to U16 - const uint16x8x2_t b0_u16 = + // Convert 8bit to 16bit + const typename wrapper::traits::neon_bitvector::type b0_b16[2] = { - { - vmovl_u8(vget_low_u8(b0_u8)), - vmovl_u8(vget_high_u8(b0_u8)) - } + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) }; // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) - } - }; + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); - vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); - vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); - vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); }, in, out); } @@ -377,17 +388,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf } // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4x4_t sum_col = + typename wrapper::traits::neon_bitvector::type sum_col[4] = { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]; + const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -398,10 +407,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); - const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride); - const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride); - const 
uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride); + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); #if __arm__ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); @@ -410,34 +419,27 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); #endif /* __arm__ */ - // Partial accumulation in u16 - uint16x8x2_t tmp_sum = + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector::type tmp_sum[2] = { - { - vdupq_n_u16(0), - vdupq_n_u16(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8)); - - // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1])) - } - }; + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); matrix_b += 4 * in_b_stride; } @@ -445,38 +447,51 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop perfoms the leftover accumulations for(; i < _k; ++i) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); // Convert S8 to S16 - const uint16x8x2_t b0_u16 = + const typename wrapper::traits::neon_bitvector::type b0_b16[2] { - { - vmovl_u8(vget_low_u8(b0_u8)), - vmovl_u8(vget_high_u8(b0_u8)) - } + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) }; - // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) - } - }; + // Accumulate to 32bit + 
sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); matrix_b += in_b_stride; } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); - vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); - vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); - vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); }, inb, out); } } + +void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_input->info()->data_type()) + { + case DataType::QASYMM8: + run_internal(window, info); + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + run_internal(window, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } +} diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 0ca7fd3dc8..ea3d32e628 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -55,7 +55,7 @@ TensorShape get_output_shape(const ITensorInfo *input) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp index 624833adfb..649316442e 100644 --- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp @@ -49,7 +49,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
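// Illustrative sketch (not the library implementation): the templated reduction kernels.
// NEGEMMLowpMatrixAReductionKernel / NEGEMMLowpMatrixBReductionKernel above now run one code
// path for uint8_t (QASYMM8) and int8_t (QASYMM8_SIGNED, QSYMM8_PER_CHANNEL): the element
// type T is widened through wrapper::traits::promote_t to 16-bit and then 32-bit accumulators
// of matching signedness, and the result is stored as S32. A scalar reference of such a
// widening reduction, given only to illustrate the promotion chain (assumed shape: an M x K
// matrix reduced along K to one int32_t per row):
#include <cstdint>
#include <type_traits>
#include <vector>

template <typename T>
using acc32_t = typename std::conditional<std::is_signed<T>::value, int32_t, uint32_t>::type;

template <typename T>
std::vector<int32_t> reduce_rows_reference(const std::vector<T> &a, int m, int k)
{
    std::vector<int32_t> sums(static_cast<size_t>(m), 0);
    for(int r = 0; r < m; ++r)
    {
        acc32_t<T> acc = 0; // unsigned accumulator for uint8_t input, signed for int8_t input
        for(int c = 0; c < k; ++c)
        {
            acc += static_cast<acc32_t<T>>(a[static_cast<size_t>(r) * k + c]);
        }
        sums[static_cast<size_t>(r)] = static_cast<int32_t>(acc); // stored as S32, as the kernels do
    }
    return sums;
}
// End of sketch.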
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); if(biases != nullptr) diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index fa335d757b..6d276d1322 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -162,6 +162,7 @@ const std::string &arm_compute::string_from_data_type(DataType dt) { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" }, { DataType::QASYMM8_PER_CHANNEL, "QASYMM8_PER_CHANNEL" }, { DataType::QASYMM8, "QASYMM8" }, + { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" }, { DataType::QSYMM16, "QSYMM16" }, { DataType::QASYMM16, "QASYMM16" }, }; @@ -292,6 +293,7 @@ std::string arm_compute::string_from_pixel_value(const PixelValue &value, const converted_string = ss.str(); break; case DataType::S8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: // Needs conversion to 32 bit, otherwise interpreted as ASCII values ss << int32_t(value.get()); @@ -448,6 +450,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const print_consecutive_elements_impl(s, ptr, n, stream_width, element_delim); break; case DataType::S8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); break; @@ -485,6 +488,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp case DataType::QASYMM8_PER_CHANNEL: return max_consecutive_elements_display_width_impl(s, ptr, n); case DataType::S8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); case DataType::U16: diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index cdd48972eb..386d75eca2 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -108,6 +108,44 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, return Status{}; } + +arm_compute::Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info, + const QuantizationInfo &wq_info, + const QuantizationInfo &oq_info, + GEMMLowpOutputStageInfo &stage_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty()); + ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty()); + ARM_COMPUTE_RETURN_ERROR_ON(oq_info.scale().empty()); + + const unsigned int size = wq_info.scale().size(); + + auto &quant_multipliers = stage_info.gemmlowp_multipliers; + auto &quant_shifts = stage_info.gemmlowp_shifts; + quant_multipliers.resize(size); + quant_shifts.resize(size); + + const auto &w_scales = wq_info.scale(); + const float i_scale = iq_info.scale().at(0); + const float o_scale = oq_info.scale().at(0); + + for(unsigned int i = 0; i < size; ++i) + { + const float multiplier = i_scale * w_scales[i] / o_scale; + int quant_multiplier = 0; + int quant_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &quant_shift)); + quant_multipliers[i] = quant_multiplier; + quant_shifts[i] = quant_shift; + } + + // Legacy part + stage_info.gemmlowp_shift = quant_shifts[0]; + stage_info.gemmlowp_multiplier = quant_multipliers[0]; + + return Status{}; +} + std::pair 
get_min_max_values_from_quantized_data_type(DataType data_type) { int min_quant_val = 0; diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index b31ecb91e9..43e531579a 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -450,13 +450,24 @@ Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo #ifndef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64"); #endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::F16, DataType::F32); + if(is_data_type_quantized_per_channel(b->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + } ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8_SIGNED && d->data_type() != DataType::S32, "Only S32 output supported for QASYMM8_SIGNED input"); return Status{}; } @@ -495,6 +506,7 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const } break; case DataType::S8: + case DataType::QASYMM8_SIGNED: create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); break; #endif /* __aarch64__ */ diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index f4377cdaf2..caff117e09 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -59,7 +59,7 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); 
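// Illustrative sketch (not the library implementation): per-channel requantization multipliers.
// calculate_quantized_multipliers_less_than_one(), added above, builds one fixed-point
// multiplier/shift pair per output channel from
//     effective_scale[c] = input_scale * weight_scale[c] / output_scale
// and stores them in GEMMLowpOutputStageInfo::gemmlowp_multipliers / gemmlowp_shifts. The
// helper below shows the usual decomposition of one such scale, assuming it lies in (0, 1):
// split it into a Q31 significand and a right shift so that
//     scale ~= multiplier * 2^-31 * 2^-shift.
// The function name is illustrative only.
#include <cmath>
#include <cstdint>
#include <utility>

inline std::pair<int32_t, int> decompose_requant_scale(double scale)
{
    int exponent = 0;
    const double significand = std::frexp(scale, &exponent); // significand in [0.5, 1)
    int64_t q = static_cast<int64_t>(std::round(significand * (1ll << 31)));
    if(q == (1ll << 31)) // rounding pushed the significand up to 1.0
    {
        q /= 2;
        ++exponent;
    }
    // scale < 1 implies exponent <= 0, so the returned right shift is >= 0
    return { static_cast<int32_t>(q), -exponent };
}
// End of sketch.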
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); if(biases != nullptr) @@ -114,18 +114,18 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset - const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform(); - - input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset)); - weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset)); - - const UniformQuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier; - int output_shift; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + const QuantizationInfo iqinfo = input->info()->quantization_info(); + const QuantizationInfo wqinfo = weights->info()->quantization_info(); + const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info(); + const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + + input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); + if(!is_data_type_quantized_per_channel(weights->info()->data_type())) + { + const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); + weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); + } // Merge activation with output stage int min_activation = 0; @@ -133,26 +133,25 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w if(supported_acts.count(act_info.activation()) != 0) { - const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo); - const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo); + const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo); + const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo); - min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int; + min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int; max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 
255 : a_const_int; } GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = oqinfo.offset; - output_info.gemmlowp_multiplier = output_multiplier; - output_info.gemmlowp_shift = output_shift; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info)); // Revert back QuantizatioInfo as input and weights could be used in other convolution layers - input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset)); - weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, wqinfo.offset)); + input->info()->set_quantization_info(iqinfo); + weights->info()->set_quantization_info(wqinfo); } else { @@ -176,20 +175,10 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset - const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - - std::unique_ptr input_qa = input->clone(); - std::unique_ptr weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset)); - weights_qa->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset)); - - const UniformQuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier; - int output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift)); + const QuantizationInfo &iqinfo = input->quantization_info(); + const QuantizationInfo &wqinfo = weights->quantization_info(); + const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); // Merge activation with output stage int min_activation = 0; @@ -201,22 +190,25 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens }; if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) { - const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo); - const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo); + const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo); + const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo); - min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int; + min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int; max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 
255 : a_const_int; } GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = oqinfo.offset; - output_info.gemmlowp_multiplier = output_multiplier; - output_info.gemmlowp_shift = output_shift; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info)); // Perform validation step on GEMMLowp + std::unique_ptr input_qa = input->clone(); + std::unique_ptr weights_qa = weights->clone(); + input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); + weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info)); } else @@ -396,7 +388,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON"); diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp index 63f330be6c..a478fdd231 100644 --- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp +++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,11 +26,12 @@ #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "support/ToolchainSupport.h" -using namespace arm_compute; - +namespace arm_compute +{ void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output) { auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 617d66cf24..01a99f7aca 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -42,9 +42,9 @@ using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), - _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), - _fuse_output_stage(false), _run_activation(false) + _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), + _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), + _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false) { } @@ -56,6 +56,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *matrix_a = a; const ITensor *matrix_b = b; + GEMMInfo info = gemm_info; // Clear state _mtx_a_reshape_kernel = nullptr; @@ -65,13 +66,41 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _a_offset = a->info()->quantization_info().uniform().offset; _b_offset = b->info()->quantization_info().uniform().offset; _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); _is_prepared = false; _fused_assembly_path = false; + _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; _original_b = b; + const ITensor *a_to_use = a; + + // Convert to QASYMM8 -> QASYMM8_SIGNED and back + if(_flip_signedness) + { + const int32_t offset_correction = 128; + const DataType dt = DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform(); + + _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); + _memory_group.manage(&_signed_a); + _convert_to_signed_asymm.configure(a_to_use, &_signed_a); + a_to_use = &_signed_a; + 
_a_offset = _signed_a.info()->quantization_info().uniform().offset; + + const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform(); + _memory_group.manage(&_signed_output); + _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); + + // Output stage correction + GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); + output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset; + output_stage_corr.gemmlowp_min_bound -= offset_correction; + output_stage_corr.gemmlowp_max_bound -= offset_correction; + info.set_gemmlowp_output_stage(output_stage_corr); + } + // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { _fuse_output_stage = true; _memory_group.manage(&_mm_result_s32); @@ -83,17 +112,18 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, switch(a->info()->data_type()) { case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: case DataType::U8: case DataType::S8: { - if(a->info()->data_type() == DataType::QASYMM8 && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - _asm_glue.configure(a, b, c, output, gemm_info); + _asm_glue.configure(a_to_use, b, c, output, gemm_info); _fused_assembly_path = _asm_glue.is_configured(); } else { - _asm_glue.configure(a, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info); + _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? 
&_mm_result_s32 : output, gemm_info); } _assembly_path = _asm_glue.is_configured(); break; @@ -111,7 +141,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, matrix_b = &_tmp_b; // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info()); + TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info()); _tmp_a.allocator()->init(a_info); @@ -125,7 +155,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, // Configure interleave kernel { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(a, &_tmp_a); + k->configure(a_to_use, &_tmp_a); _mtx_a_reshape_kernel = std::move(k); } @@ -151,19 +181,19 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false); + _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 if(_b_offset != 0) { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32); + TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); _vector_sum_row.allocator()->init(info_vector_sum_row); _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false); + _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false); } if(_fuse_output_stage) @@ -176,8 +206,17 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _mm_kernel = std::move(k); } - _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0), - _a_offset, _b_offset, gemm_info.gemmlowp_output_stage()); + _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, + _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, + _flip_signedness ? &_signed_output : output, + a->info()->dimension(0), + _a_offset, _b_offset, info.gemmlowp_output_stage()); + + if(_flip_signedness) + { + _convert_from_signed_asymm.configure(&_signed_output, output); + } } else { @@ -189,7 +228,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _mm_kernel = std::move(k); } // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); } } @@ -228,22 +267,31 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, { _mm_result_s32.allocator()->allocate(); } + + if(_flip_signedness) + { + _signed_a.allocator()->allocate(); + _signed_output.allocator()->allocate(); + } } Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); + GEMMInfo info = gemm_info; const ITensorInfo *matrix_a_info = a; const ITensorInfo *matrix_b_info = b; + const ITensorInfo *a_to_use = a; + TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; TensorInfo mm_result_s32_info{}; @@ -251,31 +299,57 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso int32_t a_offset = a->quantization_info().uniform().offset; int32_t b_offset = b->quantization_info().uniform().offset; - bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; + bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; if(fuse_output_stage) { auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); } + // Convert QASYMM8->QASYMM8_SIGNED + TensorInfo signed_a{}; + TensorInfo signed_output{}; + bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); + if(flip_signedness) + { + const int32_t offset_correction = 128; + const DataType dt = DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); + + signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); + a_to_use = &signed_a; + a_offset = signed_a.quantization_info().uniform().offset; + + const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); + signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + + // Output stage correction + GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); + output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset; + output_stage_corr.gemmlowp_min_bound -= offset_correction; + output_stage_corr.gemmlowp_max_bound -= offset_correction; 
+ info.set_gemmlowp_output_stage(output_stage_corr); + } + // Check if we need to run the optimized assembly kernel bool run_optimised = false; bool run_optimised_requantized = false; - if(is_data_type_quantized_asymmetric(a->data_type())) + if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, gemm_info)); + run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); run_optimised_requantized = run_optimised; } else { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info)); + run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info)); } if(run_optimised) { ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) + if(info.depth_output_gemm3d() != 0) { - if(gemm_info.reinterpret_input_as_3d()) + if(info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); @@ -292,8 +366,8 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); const bool run_vector_matrix_multiplication = a->dimension(1) < 2; if(!run_vector_matrix_multiplication) @@ -312,10 +386,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a)); + auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); } } @@ -340,7 +414,7 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false)); } if(fuse_output_stage) @@ -354,8 +428,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? 
nullptr : &info_vector_sum_row, - c, output, a_offset, b_offset, - gemm_info.gemmlowp_output_stage())); + c, + flip_signedness ? &signed_output : output, + a_offset, b_offset, + info.gemmlowp_output_stage())); } else { @@ -397,6 +473,12 @@ void NEGEMMLowpMatrixMultiplyCore::run() NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); } + // Convert QASYMM8->QASYMM8_SIGNED + if(_flip_signedness) + { + NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY); + } + // Run GEMM if(_asm_glue.is_configured()) { @@ -433,6 +515,12 @@ void NEGEMMLowpMatrixMultiplyCore::run() } } + // Convert QASYMM8_SIGNED->QASYMM8 + if(_flip_signedness) + { + NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY); + } + // Run fused activation if(_run_activation) { diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h index f535f16303..280f6ddbd0 100644 --- a/tests/AssetsLibrary.h +++ b/tests/AssetsLibrary.h @@ -640,6 +640,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t } case DataType::S8: case DataType::QSYMM8: + case DataType::QASYMM8_SIGNED: { std::uniform_int_distribution distribution_s8(std::numeric_limits::lowest(), std::numeric_limits::max()); fill(tensor, distribution_s8, seed_offset); diff --git a/tests/Utils.h b/tests/Utils.h index 0c4aeb61f5..6b3935e526 100644 --- a/tests/Utils.h +++ b/tests/Utils.h @@ -359,6 +359,7 @@ void store_value_with_data_type(void *ptr, T value, DataType data_type) *reinterpret_cast(ptr) = value; break; case DataType::S8: + case DataType::QASYMM8_SIGNED: case DataType::QSYMM8: case DataType::QSYMM8_PER_CHANNEL: *reinterpret_cast(ptr) = value; diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index 4158793295..95a5548628 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -326,6 +326,24 @@ std::pair get_quantized_bounds(const QuantizationInfo &quant_info, flo return std::pair { min_bound, max_bound }; } +std::pair get_symm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id) +{ + ARM_COMPUTE_ERROR_ON_MSG(min > max, "min must be lower equal than max"); + + const int min_bound = quantize_qsymm8_per_channel(min, quant_info, channel_id); + const int max_bound = quantize_qsymm8_per_channel(max, quant_info, channel_id); + return std::pair { min_bound, max_bound }; +} + +std::pair get_asymm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id) +{ + ARM_COMPUTE_ERROR_ON_MSG(min > max, "min must be lower equal than max"); + + const int min_bound = quantize_qasymm8_per_channel(min, quant_info, channel_id); + const int max_bound = quantize_qasymm8_per_channel(max, quant_info, channel_id); + return std::pair { min_bound, max_bound }; +} + template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index 2ee2dc7aab..2c1df39f14 100644 --- a/tests/validation/Helpers.h +++ b/tests/validation/Helpers.h @@ -276,6 +276,24 @@ void zeros(SimpleTensor &in, const Coordinates &anchor, const TensorShape &sh * @param[in] max Floating point maximum value to be quantized */ std::pair get_quantized_bounds(const QuantizationInfo &quant_info, float min, float max); + +/** Helper 
function to compute symmetric quantized min and max bounds + * + * @param[in] quant_info Quantization info to be used for conversion + * @param[in] min Floating point minimum value to be quantized + * @param[in] max Floating point maximum value to be quantized + * @param[in] channel_id Channel id for per channel quantization info. + */ +std::pair get_symm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id = 0); + +/** Helper function to compute asymmetric quantized min and max bounds + * + * @param[in] quant_info Quantization info to be used for conversion + * @param[in] min Floating point minimum value to be quantized + * @param[in] max Floating point maximum value to be quantized + * @param[in] channel_id Channel id for per channel quantization info. + */ +std::pair get_asymm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id = 0); } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index ceecd58058..df52d8065b 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -74,6 +74,13 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f) }); + +const auto QuantizationData = framework::dataset::make("QuantizationInfo", +{ + QuantizationInfo(0.5f, 10), + QuantizationInfo(0.3f, 3), + QuantizationInfo(1.f, 10), +}); } // namespace TEST_SUITE(NEON) @@ -422,6 +429,9 @@ TEST_SUITE_END() // Float template using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture; +template +using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture; + const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), @@ -451,6 +461,33 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerQuantizedFixture validate(Accessor(_target), _reference, tolerance_qasymm8); } TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QSYMM8_PER_CHANNEL) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerReducedDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", { DataType::QASYMM8 })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + QuantizationData), + ActivationFunctionsDataset), + framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", { DataType::QASYMM8 })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + QuantizationData), + QuantizedActivationFunctionsDataset), + framework::dataset::make("WeightsDataType", { 
DataType::QSYMM8_PER_CHANNEL }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QSYMM8_PER_CHANNEL TEST_SUITE_END() // Quantized TEST_SUITE_END() // GEMMConvolutionLayer diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h index 52fa8da60b..c5cddc28db 100644 --- a/tests/validation/fixtures/ConvolutionLayerFixture.h +++ b/tests/validation/fixtures/ConvolutionLayerFixture.h @@ -48,7 +48,7 @@ namespace test { namespace validation { -template +template class ConvolutionValidationGenericFixture : public framework::Fixture { public: @@ -57,13 +57,15 @@ public: public: template void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, - DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info) + DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info) { - _data_type = data_type; - _is_quantized = is_data_type_quantized_asymmetric(data_type); - _bias_data_type = _is_quantized ? DataType::S32 : data_type; - _quantization_info = quantization_info; - _data_layout = data_layout; + _data_type = data_type; + _weights_data_type = weights_data_type; + _is_quantized = is_data_type_quantized_asymmetric(data_type); + _bias_data_type = _is_quantized ? DataType::S32 : data_type; + _quantization_info = quantization_info; + _weight_quantization_info = weight_quantization_info; + _data_layout = data_layout; _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info); _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info); @@ -82,6 +84,26 @@ protected: library->fill(tensor, distribution, i); break; } + case DataType::QSYMM8_PER_CHANNEL: + { + int min_bound = 128; + int max_bound = -127; + for(size_t i = 0; i < _weight_quantization_info.scale().size(); i++) + { + std::pair bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f, i); + if(bounds.first < min_bound) + { + min_bound = bounds.first; + } + if(bounds.second > max_bound) + { + max_bound = bounds.second; + } + } + std::uniform_int_distribution distribution(min_bound, max_bound); + library->fill(tensor, distribution, i); + break; + } case DataType::S32: { std::uniform_int_distribution distribution(-100, 100); @@ -122,7 +144,7 @@ protected: // Create tensors TensorType src = create_tensor(input_shape, _data_type, 1, _quantization_info, _data_layout); - TensorType weights = create_tensor(reshaped_weights_shape, _data_type, 1, _quantization_info, _data_layout); + TensorType weights = create_tensor(reshaped_weights_shape, _weights_data_type, 1, _weight_quantization_info, _data_layout); TensorType bias = create_tensor(bias_shape, _bias_data_type, 1, _quantization_info, _data_layout); TensorType dst = create_tensor(output_shape, _data_type, 1, _quantization_info, _data_layout); @@ -166,7 +188,7 @@ protected: // Create reference SimpleTensor src{ input_shape, _data_type, 1, _quantization_info }; - SimpleTensor weights{ weights_shape, _data_type, 1, _quantization_info }; + SimpleTensor weights{ weights_shape, _weights_data_type, 1, _weight_quantization_info }; SimpleTensor bias{ bias_shape, _bias_data_type, 
1, _quantization_info }; // Fill reference @@ -182,36 +204,59 @@ protected: TensorType _target{}; SimpleTensor _reference{}; DataType _data_type{}; + DataType _weights_data_type{}; DataType _bias_data_type{}; DataLayout _data_layout{}; QuantizationInfo _quantization_info{}; + QuantizationInfo _weight_quantization_info{}; bool _is_quantized = false; }; template -class ConvolutionValidationFixture : public ConvolutionValidationGenericFixture +class ConvolutionValidationFixture : public ConvolutionValidationGenericFixture { public: template void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, DataLayout data_layout, ActivationLayerInfo act_info) { - ConvolutionValidationGenericFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, - data_type, data_layout, - QuantizationInfo(), act_info); + ConvolutionValidationGenericFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, + data_type, data_type, data_layout, + QuantizationInfo(), QuantizationInfo(), act_info); } }; template -class ConvolutionValidationQuantizedFixture : public ConvolutionValidationGenericFixture +class ConvolutionValidationQuantizedFixture : public ConvolutionValidationGenericFixture { public: template void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info) { - ConvolutionValidationGenericFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, - data_type, data_layout, quantization_info, act_info); + ConvolutionValidationGenericFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, + data_type, data_type, data_layout, quantization_info, quantization_info, act_info); + } +}; + +template +class ConvolutionValidationQuantizedPerChannelFixture : public ConvolutionValidationGenericFixture +{ +public: + template + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, + DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataType weights_data_type) + { + std::vector weights_scales{}; + std::mt19937 gen(library->seed()); + std::uniform_real_distribution<> dis(0.01f, 1); + for(size_t i = 0; i < output_shape[2]; ++i) + { + weights_scales.push_back(dis(gen)); + } + ConvolutionValidationGenericFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, + reshape_weights, data_type, weights_data_type, data_layout, + quantization_info, QuantizationInfo(weights_scales), act_info); } }; } // namespace validation diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h index 30be25f504..23918a4055 100644 --- a/tests/validation/reference/Convolution3d.h +++ b/tests/validation/reference/Convolution3d.h @@ -42,13 +42,16 @@ inline bool is_valid_pixel(int i, int min, int max) } // 3D convolution for floating point type -template < typename T, typename TB, typename std::enable_if < validation::is_floating_point::value &&validation::is_floating_point::value, int 
>::type = 0 > -inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weights, const SimpleTensor &bias, SimpleTensor &out, +template < typename T, typename TW, typename TB, typename std::enable_if < validation::is_floating_point::value &&validation::is_floating_point::value + &&validation::is_floating_point::value, + int >::type = 0 > +inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weights, const SimpleTensor &bias, SimpleTensor &out, int i_offset, int w_offset, int b_offset, int o_offset, - int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1) + int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1, int filter_id = 0) { + ARM_COMPUTE_UNUSED(filter_id); const T *in_ptr = in.data() + i_offset; - const T *w_ptr = weights.data() + w_offset; + const TW *w_ptr = weights.data() + w_offset; const TB *b_ptr = bias.data() + b_offset; T *out_ptr = out.data() + o_offset; @@ -77,8 +80,8 @@ inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weig const int idx = xk + half_width_weights_start; const int idy = yk + half_height_weights_start; - const T i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in]; - const T w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights]; + const T i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in]; + const TW w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights]; acc += i_value * w_value; } @@ -91,13 +94,16 @@ inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weig } // 3D convolution for QASYMM8 type -template < typename T, typename TB, typename std::enable_if < std::is_same::value &&std::is_same::value, int >::type = 0 > -inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weights, const SimpleTensor &bias, SimpleTensor &out, +template < typename T, typename TW, typename TB, typename std::enable_if < std::is_same::value &&(std::is_same::value + || std::is_same::value) + &&std::is_same::value, + int >::type = 0 > +inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weights, const SimpleTensor &bias, SimpleTensor &out, int i_offset, int w_offset, int b_offset, int o_offset, - int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1) + int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1, int filter_id = 0) { const T *in_ptr = in.data() + i_offset; - const T *w_ptr = weights.data() + w_offset; + const TW *w_ptr = weights.data() + w_offset; const TB *b_ptr = bias.data() + b_offset; T *out_ptr = out.data() + o_offset; @@ -107,10 +113,22 @@ inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weig const int input_offset = -iq_info.offset; const float input_scale = iq_info.scale; - const int weights_offset = -wq_info.offset; - const float weights_scale = wq_info.scale; - const int output_offset = oq_info.offset; - const float output_scale = oq_info.scale; + int weights_offset = -wq_info.offset; + float weights_scale = wq_info.scale; + if(is_data_type_quantized_per_channel(weights.data_type())) + { + if(is_data_type_quantized_asymmetric(weights.data_type())) + { + weights_offset = 
weights.quantization_info().offset()[filter_id]; + } + else + { + weights_offset = 0; + } + weights_scale = weights.quantization_info().scale()[filter_id]; + } + const int output_offset = oq_info.offset; + const float output_scale = oq_info.scale; int output_multiplier = 0; int output_shift = 0; @@ -142,9 +160,8 @@ inline void convolution3d(const SimpleTensor &in, const SimpleTensor &weig const int idx = xk + half_width_weights_start; const int idy = yk + half_height_weights_start; - const uint8_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in]; - const uint8_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights]; - + const int32_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in]; + const int32_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights]; acc += (i_value + input_offset) * (w_value + weights_offset); } } diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp index 69090117fe..4d2c1acb6f 100644 --- a/tests/validation/reference/ConvolutionLayer.cpp +++ b/tests/validation/reference/ConvolutionLayer.cpp @@ -45,8 +45,8 @@ namespace { } // namespace -template -SimpleTensor convolution_layer_nchw(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, SimpleTensor &dst, const PadStrideInfo &info, +template +SimpleTensor convolution_layer_nchw(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, SimpleTensor &dst, const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON((src.shape()[2] / num_groups) != weights.shape()[2]); @@ -73,7 +73,6 @@ SimpleTensor convolution_layer_nchw(const SimpleTensor &src, const SimpleT const int end_xi = output_wh.first * stride_xi; const int end_yi = output_wh.second * stride_yi; const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in); - for(int r = 0; r < num_batches; ++r) { for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi) @@ -100,17 +99,16 @@ SimpleTensor convolution_layer_nchw(const SimpleTensor &src, const SimpleT offset_in, offset_w, offset_b, offset_out, xi, yi, width_in, height_in, (depth_in / num_groups), - width_weights, height_weights, dilation.x(), dilation.y()); + width_weights, height_weights, dilation.x(), dilation.y(), ofm); } } } } } - return dst; } -template -SimpleTensor convolution_layer(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, const PadStrideInfo &info, +template +SimpleTensor convolution_layer(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info) { // if no explicit quantization has been set you the same as src @@ -123,9 +121,9 @@ SimpleTensor convolution_layer(const SimpleTensor &src, const SimpleTensor if(src.data_layout() == DataLayout::NHWC) { - SimpleTensor src_nchw = reference::permute(src, PermutationVector(1U, 2U, 0U)); - SimpleTensor weights_nchw = reference::permute(weights, PermutationVector(1U, 2U, 0U)); - SimpleTensor dst_nchw = reference::permute(dst, PermutationVector(1U, 2U, 0U)); + SimpleTensor src_nchw = reference::permute(src, PermutationVector(1U, 2U, 0U)); + SimpleTensor weights_nchw = reference::permute(weights, PermutationVector(1U, 2U, 0U)); + SimpleTensor dst_nchw = 
reference::permute(dst, PermutationVector(1U, 2U, 0U)); return reference::permute(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation, num_groups), PermutationVector(2U, 0U, 1U)); } @@ -141,6 +139,8 @@ template SimpleTensor convolution_layer(const SimpleTensor &src, con const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info); template SimpleTensor convolution_layer(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info); +template SimpleTensor convolution_layer(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, + const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h index c51a9b3ad7..8f41073fe2 100644 --- a/tests/validation/reference/ConvolutionLayer.h +++ b/tests/validation/reference/ConvolutionLayer.h @@ -35,8 +35,8 @@ namespace validation { namespace reference { -template -SimpleTensor convolution_layer(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, const PadStrideInfo &info, +template +SimpleTensor convolution_layer(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, const PadStrideInfo &info, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, QuantizationInfo out_quant_info = QuantizationInfo()); } // namespace reference } // namespace validation -- cgit v1.2.1
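
Note on the output-stage change above: the single requantization multiplier that configure_mm()/validate_mm() used to compute as iqinfo.scale * wqinfo.scale / oqinfo.scale is replaced by quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info), which has to produce one fixed-point multiplier/shift pair per weight channel when the weights are QSYMM8_PER_CHANNEL. The sketch below is illustrative only and is not the library helper; it simply applies the existing single-channel formula once per channel, assuming the usual Q0.31 fixed-point encoding and multipliers in (0, 1) as the helper's name implies.

// Illustrative sketch only (not the library helper): compute one fixed-point
// (multiplier, right-shift) pair per output channel by applying the existing
// single-channel formula input_scale * weight_scale[c] / output_scale per channel.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Encode a real multiplier in (0, 1) as a Q0.31 value plus a right shift.
static void quantize_multiplier_less_than_one(float real, int32_t *quant_multiplier, int *right_shift)
{
    int          shift   = 0;
    const double q       = std::frexp(real, &shift); // real = q * 2^shift, q in [0.5, 1)
    int64_t      q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if(q_fixed == (1ll << 31))
    {
        q_fixed /= 2;
        ++shift;
    }
    *right_shift      = -shift; // >= 0 when real < 1
    *quant_multiplier = static_cast<int32_t>(q_fixed);
}

static void compute_per_channel_multipliers(float                     input_scale,
                                            const std::vector<float> &weight_scales, // one scale per channel
                                            float                     output_scale,
                                            std::vector<int32_t>     &multipliers,
                                            std::vector<int>         &shifts)
{
    multipliers.resize(weight_scales.size());
    shifts.resize(weight_scales.size());
    for(std::size_t c = 0; c < weight_scales.size(); ++c)
    {
        quantize_multiplier_less_than_one(input_scale * weight_scales[c] / output_scale,
                                          &multipliers[c], &shifts[c]);
    }
}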
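
Note on the _flip_signedness path in NEGEMMLowpMatrixMultiplyCore: when the weights are per-channel quantized and the input is QASYMM8, the input is converted to QASYMM8_SIGNED (and the result converted back) around the GEMM, with an offset correction of 128 applied to the quantization info and to the output-stage offset and bounds. A scalar sketch of the value conversion is given below; it is an assumption that NEConvertQuantizedSignednessKernel performs the vectorised equivalent of this bit flip, and the sign of the offset adjustment ("+ 128" for the input here) reflects the already-negated offsets coming from configure_mm().

// Scalar sketch of the QASYMM8 <-> QASYMM8_SIGNED flip performed around the GEMM.
// Flipping the most significant bit maps [0, 255] onto [-128, 127]; moving the
// quantization offset by the same 128 keeps the represented real values identical.
#include <cstddef>
#include <cstdint>

inline int8_t u8_to_s8(uint8_t v)
{
    return static_cast<int8_t>(v ^ 0x80); // same as subtracting 128 (mod 256)
}

inline uint8_t s8_to_u8(int8_t v)
{
    return static_cast<uint8_t>(static_cast<uint8_t>(v) ^ 0x80);
}

void flip_to_signed(const uint8_t *src, int8_t *dst, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        dst[i] = u8_to_s8(src[i]);
    }
}

void flip_to_unsigned(const int8_t *src, uint8_t *dst, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        dst[i] = s8_to_u8(src[i]);
    }
}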
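
Note on the per-channel handling in the reference Convolution3d: the weight scale (and, for asymmetric per-channel types, the offset) is now looked up per output filter via filter_id, with weights_offset forced to 0 in the symmetric case. The float-domain sketch below shows the per-filter requantization this implies; it is simplified for clarity and does not reproduce the fixed-point multiplier/shift path used by the actual reference and kernels.

// Illustrative float-domain sketch: requantize one int32 accumulator of output
// filter `filter_id` using a per-channel weight scale (weights_offset is 0 for
// QSYMM8_PER_CHANNEL).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

uint8_t requantize_output(int32_t                   acc,           // sum of (in + in_offset) * (w + w_offset) over the window
                          int32_t                   bias,
                          float                     input_scale,
                          const std::vector<float> &weight_scales, // one scale per output filter
                          float                     output_scale,
                          int32_t                   output_offset,
                          std::size_t               filter_id)
{
    const float   multiplier = input_scale * weight_scales[filter_id] / output_scale;
    const int32_t quantized  = static_cast<int32_t>(std::lround((acc + bias) * multiplier)) + output_offset;
    return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, quantized)));
}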