From 70d43a3671090d7ab104909a9433c88e02593038 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Mon, 22 Jun 2020 17:05:43 +0100
Subject: COMPMID-3538: Remove templates from
 NEGEMMLowpOffsetContributionOutputStageKernel

This change reduces the core's library size by 191Kb.

Change-Id: Ifb8eb0d7f8bc7713f2368803a62a4c9277cc5c87
Signed-off-by: Michalis Spyrou
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3439
Reviewed-by: Michele Di Giorgio
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 arm_compute/core/NEON/NEAsymm.h                     | 91 ++++++++++------
 ...NEGEMMLowpOffsetContributionOutputStageKernel.h  | 26 +++----
 2 files changed, 53 insertions(+), 64 deletions(-)

(limited to 'arm_compute')

diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index e4f4250d16..8558706c4d 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,24 +67,23 @@ int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4
 
 /** Performs final quantization step on 16 elements
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_u8                        Relu lower bound
- * @param max_u8                        Relu upper bound
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_u8                        Relu lower bound
+ * @param[in] max_u8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized values
  */
-template <bool is_bounded_relu>
-uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
-                                 int          result_fixedpoint_multiplier,
-                                 int32_t      result_shift,
-                                 int32x4_t    result_offset_after_shift_s32,
-                                 uint8x16_t   min_u8,
-                                 uint8x16_t   max_u8)
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
+                                        int          result_fixedpoint_multiplier,
+                                        int32_t      result_shift,
+                                        int32x4_t    result_offset_after_shift_s32,
+                                        uint8x16_t   min_u8,
+                                        uint8x16_t   max_u8,
+                                        bool         is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -150,24 +149,23 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
 
 /** Performs final quantization step on 16 elements
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8                        Relu lower bound
- * @param max_s8                        Relu upper bound
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized values
  */
-template <bool is_bounded_relu>
-int8x16_t finalize_quantization(int32x4x4_t &in_s32,
-                                int          result_fixedpoint_multiplier,
-                                int32_t      result_shift,
-                                int32x4_t    result_offset_after_shift_s32,
-                                int8x16_t    min_s8,
-                                int8x16_t    max_s8)
+inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
+                                       int          result_fixedpoint_multiplier,
+                                       int32_t      result_shift,
+                                       int32x4_t    result_offset_after_shift_s32,
+                                       int8x16_t    min_s8,
+                                       int8x16_t    max_s8,
+                                       bool         is_bounded_relu)
 {
     if(result_shift < 0)
     {
@@ -225,24 +223,23 @@ int8x16_t finalize_quantization(int32x4x4_t &in_s32,
 
 /** Performs final quantization step on 16 elements for symmetric quantization
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8                        Relu lower bound
- * @param max_s8                        Relu upper bound
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized values
  */
-template <bool is_bounded_relu>
 inline int8x16_t finalize_quantization_symm(int32x4x4_t       &in_s32,
                                             const int32x4x4_t &result_fixedpoint_multiplier,
                                             const int32x4x4_t &result_shift,
                                             const int32x4_t   &result_offset_after_shift_s32,
                                             const int8x16_t   &min_s8,
-                                            const int8x16_t   &max_s8)
+                                            const int8x16_t   &max_s8,
+                                            const bool         is_bounded_relu)
 {
     const static int32x4_t one_s32 = vdupq_n_s32(1);
 
@@ -321,8 +318,6 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
 }
 
 /** Performs final quantization step on single element
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
  *
  * @param[in] in_value                      Input to be quantized.
  * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
@@ -330,13 +325,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
  * @param[in] result_offset_after_shift_s32 Result offset parameter
  * @param[in] min_u8                        Relu lower bound
  * @param[in] max_u8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized value
  */
-template <bool is_bounded_relu>
 inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                     uint8_t min_u8, uint8_t max_u8)
+                                     uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
@@ -366,8 +361,6 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
 }
 
 /** Performs final quantization step on single element
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
  *
  * @param[in] in_value                      Input to be quantized.
  * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
@@ -375,13 +368,13 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
  * @param[in] result_offset_after_shift_s32 Result offset parameter
  * @param[in] min_s8                        Relu lower bound
  * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized value
  */
-template <bool is_bounded_relu>
 inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                    int8_t min_s8, int8_t max_s8)
+                                    int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
index 0dc64c9842..203b26e422 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -115,22 +115,18 @@ public:
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
-    using NEGEMMLowpOffsetContributionOutputStageFunction = std::function;
-
 private:
     /** Function to use for the particular tensors passed to configure() */
-    NEGEMMLowpOffsetContributionOutputStageFunction _function;
-    const ITensor                                  *_vector_sum_col;
-    const ITensor                                  *_vector_sum_row;
-    const ITensor                                  *_bias;
-    const ITensor                                  *_mm_result;
-    ITensor                                        *_output;
-    int32_t                                         _a_offset;
-    int32_t                                         _b_offset;
-    int32_t                                         _k_offset;
-    bool                                            _slide_vector_sum_col;
-    GEMMLowpOutputStageInfo                         _output_stage;
+    const ITensor          *_vector_sum_col;
+    const ITensor          *_vector_sum_row;
+    const ITensor          *_bias;
+    const ITensor          *_mm_result;
+    ITensor                *_output;
+    int32_t                 _a_offset;
+    int32_t                 _b_offset;
+    int32_t                 _k_offset;
+    bool                    _slide_vector_sum_col;
+    GEMMLowpOutputStageInfo _output_stage;
 };
 } // namespace arm_compute
--
cgit v1.2.1
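For context on why dropping the templates shrinks the binary: with template <bool is_bounded_relu>, every finalize_quantization variant is instantiated once per boolean value, and the kernel's dispatch code presumably multiplies those instantiations further across data types and fused output-stage combinations; passing the flag as an ordinary runtime argument leaves a single body per routine. The sketch below is a simplified, scalar analogue of that refactoring, not the actual code touched by this patch: the names quantize_down_old and quantize_down_new and the clamp arithmetic are invented for illustration, whereas the real finalize_quantization overloads in NEAsymm.h operate on int32x4x4_t NEON register blocks.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Before: the ReLU-bounds check is a compile-time template parameter, so the
// object file carries one copy of the body per instantiation that gets used.
template <bool is_bounded_relu>
uint8_t quantize_down_old(int32_t acc, int32_t offset, uint8_t min_u8, uint8_t max_u8)
{
    int32_t out = acc + offset;
    out         = std::max<int32_t>(0, std::min<int32_t>(255, out));
    if(is_bounded_relu)
    {
        out = std::max<int32_t>(min_u8, std::min<int32_t>(max_u8, out));
    }
    return static_cast<uint8_t>(out);
}

// After: the flag is a plain argument, so one body serves both cases; this is
// the shape of the change applied to the finalize_quantization helpers above.
inline uint8_t quantize_down_new(int32_t acc, int32_t offset, uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
{
    int32_t out = acc + offset;
    out         = std::max<int32_t>(0, std::min<int32_t>(255, out));
    if(is_bounded_relu)
    {
        out = std::max<int32_t>(min_u8, std::min<int32_t>(max_u8, out));
    }
    return static_cast<uint8_t>(out);
}

int main()
{
    const bool bounded = true;

    // Old style: the caller branches to pick an instantiation, and the binary
    // ends up containing both the <true> and the <false> copy of the body.
    const uint8_t a = bounded ? quantize_down_old<true>(300, -20, 10, 200)
                              : quantize_down_old<false>(300, -20, 10, 200);

    // New style: one call site, one symbol in the binary.
    const uint8_t b = quantize_down_new(300, -20, 10, 200, bounded);

    std::cout << int(a) << " " << int(b) << std::endl; // both print 200
    return 0;
}

Because is_bounded_relu is fixed for a whole kernel run, the branch that replaces the template parameter is loop-invariant and should predict essentially perfectly, which is presumably why the 191Kb size saving was judged a good trade against any per-element cost.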