From bc415af5ee9517fd113e9ea0f01fdc84f9693dc4 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 13 Jun 2019 15:58:32 +0100
Subject: COMPMID-2406: Create a new GEMMLowpQuantizeDownInt32ToInt16ScaleKernel for NEON

Change-Id: I3f3e247728fd6dafca066e41835f0ef9442d9b7a
Signed-off-by: giuros01
Reviewed-on: https://review.mlplatform.org/c/1379
Comments-Addressed: Arm Jenkins
Reviewed-by: Georgios Pinitas
Reviewed-by: Gian Marco Iodice
Tested-by: Arm Jenkins
---
 arm_compute/core/NEON/NEKernels.h                  |   1 +
 arm_compute/core/NEON/NESymm.h                     | 106 +++++++++++++++++++
 ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h | 116 +++++++++++++++++++++
 .../runtime/NEON/functions/NEGEMMLowpOutputStage.h |  62 ++++++++++-
 4 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 arm_compute/core/NEON/NESymm.h
 create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h

diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 206ff06f8a..4023d82107 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -78,6 +78,7 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h
new file mode 100644
index 0000000000..0479753426
--- /dev/null
+++ b/arm_compute/core/NEON/NESymm.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESYMM_H__
+#define __ARM_COMPUTE_NESYMM_H__
+
+#include "NEAsymm.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+/** Performs final quantization step on 8 signed 16-bit elements
+ *
+ * @tparam is_bounded_relu Specifies if a fused bounded relu should be applied
+ *
+ * @param[in] in_s32                       Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift                 Result shift parameter
+ * @param[in] min_s16                      Relu lower bound
+ * @param[in] max_s16                      Relu upper bound
+ *
+ * @return Quantized values
+ */
+template <bool is_bounded_relu>
+int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
+                                      int          result_fixedpoint_multiplier,
+                                      int32_t      result_shift,
+                                      int16x8_t    min_s16,
+                                      int16x8_t    max_s16)
+{
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+    in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+
+    // Round to the nearest division by a power-of-two using result_shift
+    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+
+    // Convert S32 to S16
+    int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_s16 = vmaxq_s16(out_s16, min_s16);
+        out_s16 = vminq_s16(out_s16, max_s16);
+    }
+
+    return out_s16;
+}
+
+/** Performs final quantization step on a single signed 16-bit element
+ *
+ * @tparam is_bounded_relu Specifies if a fused bounded relu should be applied
+ *
+ * @param[in] in_value                     Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift                 Result shift parameter
+ * @param[in] min_s16                      Relu lower bound
+ * @param[in] max_s16                      Relu upper bound
+ *
+ * @return Quantized value
+ */
+template <bool is_bounded_relu>
+inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
+                                           int32_t result_shift, int16_t min_s16, int16_t max_s16)
+{
+    int32x4_t in_s32 = vdupq_n_s32(in_value);
+
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+
+    // Shift value by result_shift
+    in_value = rounding_divide_by_pow2(in_value, result_shift);
+
+    // Bound the result
+    int16_t out_s16 = static_cast<int16_t>(std::max(-32768, std::min(32767, in_value)));
+
+    if(is_bounded_relu)
+    {
+        out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
+    }
+
+    return out_s16;
+}
+} // namespace arm_compute
+#endif // __ARM_COMPUTE_NESYMM_H__
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..806c0e4c2f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication of each entry of the input by result_fixedpoint_multiplier
+ *  -# Add bias to the final result if the bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input                        Input tensor. Data type supported: S32
+     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output                       Output tensor. Data type supported: QSYMM16
+     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix
+     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
+     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
+     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
+     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+     *
+     * @param[in] input  Input tensor info. Data type supported: S32
+     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor info. Data type supported: QSYMM16
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <bool is_bounded_relu>
+    void run(const Window &window);
+
+    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(const Window &window);
+
+    QuantizeDownFunctionPtr _func;
+    const ITensor          *_input;
+    const ITensor          *_bias;
+    ITensor                *_output;
+    int                     _result_fixedpoint_multiplier;
+    int                     _result_shift;
+    int                     _min;
+    int                     _max;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index 77bfb98ba6..5ece753660 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -147,5 +147,63 @@ public:
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
 };
+/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on NEON.
+ *
+ * NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
+ *
+ *  result_fixedpoint_multiplier, result_shift
+ *
+ * The final result is:
+ *
+ *  (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ *  (x * y) / 2^31
+ *
+ * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ *  (FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier) >> result_shift)
+ *
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ *
+ * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ *       after the result is shifted right by result_shift
+*/
+class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder
+{
+public:
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input                        Input tensor. Data type supported: S32
+     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output                       Output tensor. Data type supported: QSYMM16
+     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix
+     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
+     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
+     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
+     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
+     *
+     * @param[in] input  Input tensor info. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor info. Data type supported: QSYMM16
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+};
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
+#endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
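
As a companion to the documentation in the patch, the following stand-alone C++ sketch mirrors the documented quantize-down pipeline in plain scalar code: FixedPointMul as a rounding doubling multiply-high (the scalar analogue of vqrdmulhq_n_s32, saturation corner case aside), a rounding division by a power of two matching rounding_divide_by_pow2, and saturation to the QSYMM16 range. It is illustrative only; the helper names and the values in main() are invented for this example and are not part of the library.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Nearest integer to (x * y) / 2^31, computed without overflow in 64-bit:
// the "FixedPointMul" the class documentation describes.
int32_t fixed_point_mul(int32_t x, int32_t y)
{
    const int64_t product = static_cast<int64_t>(x) * static_cast<int64_t>(y);
    return static_cast<int32_t>((product + (int64_t(1) << 30)) >> 31);
}

// Rounding division by 2^shift (scalar form of rounding_divide_by_pow2).
int32_t rounding_divide_by_pow2_scalar(int32_t value, int shift)
{
    const int32_t mask      = (1 << shift) - 1;
    const int32_t threshold = (mask >> 1) + (value < 0 ? 1 : 0);
    return (value >> shift) + (((value & mask) > threshold) ? 1 : 0);
}

// One int32 accumulator through the documented steps: add bias, fixed point
// multiply, rounding shift, then saturate to [-32768, 32767] and cast.
int16_t quantize_down_to_int16(int32_t acc, int32_t bias, int32_t multiplier, int shift)
{
    int32_t v = fixed_point_mul(acc + bias, multiplier);
    v         = rounding_divide_by_pow2_scalar(v, shift);
    return static_cast<int16_t>(std::max(-32768, std::min(32767, v)));
}

int main()
{
    // Illustrative values: multiplier 2^30 represents 0.5 in Q0.31, so the
    // accumulator 1000 plus bias 24 becomes 1024 * 0.5 = 512, then >> 1 = 256.
    std::cout << quantize_down_to_int16(1000, 24, int32_t(1) << 30, 1) << '\n';
    return 0;
}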