From 70d43a3671090d7ab104909a9433c88e02593038 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Mon, 22 Jun 2020 17:05:43 +0100 Subject: COMPMID-3538: Remove templates from NEGEMMLowpOffsetContributionOutputStageKernel This change reduces the core's library size by 191Kb. Change-Id: Ifb8eb0d7f8bc7713f2368803a62a4c9277cc5c87 Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3439 Reviewed-by: Michele Di Giorgio Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/core/NEON/NEAsymm.h | 91 +++---- ...NEGEMMLowpOffsetContributionOutputStageKernel.h | 26 +- .../NEDirectConvolutionLayerOutputStageKernel.cpp | 4 +- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 285 +++++++++------------ ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 14 +- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 10 +- 6 files changed, 190 insertions(+), 240 deletions(-) diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h index e4f4250d16..8558706c4d 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/arm_compute/core/NEON/NEAsymm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -67,24 +67,23 @@ int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4 /** Performs final quantization step on 16 elements * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied - * - * @param in_s32 Input to be quantized. - * @param result_fixedpoint_multiplier Result multiplier parameter - * @param result_shift Result shift parameter - * @param result_offset_after_shift_s32 Result offset parameter - * @param min_u8 Relu lower bound - * @param max_u8 Relu upper bound + * @param[in] in_s32 Input to be quantized. + * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_u8 Relu lower bound + * @param[in] max_u8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized values */ -template -uint8x16_t finalize_quantization(int32x4x4_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int32x4_t result_offset_after_shift_s32, - uint8x16_t min_u8, - uint8x16_t max_u8) +inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32x4_t result_offset_after_shift_s32, + uint8x16_t min_u8, + uint8x16_t max_u8, + bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -150,24 +149,23 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32, /** Performs final quantization step on 16 elements * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied - * - * @param in_s32 Input to be quantized. - * @param result_fixedpoint_multiplier Result multiplier parameter - * @param result_shift Result shift parameter - * @param result_offset_after_shift_s32 Result offset parameter - * @param min_s8 Relu lower bound - * @param max_s8 Relu upper bound + * @param[in] in_s32 Input to be quantized. 
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized values */ -template -int8x16_t finalize_quantization(int32x4x4_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int32x4_t result_offset_after_shift_s32, - int8x16_t min_s8, - int8x16_t max_s8) +inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32x4_t result_offset_after_shift_s32, + int8x16_t min_s8, + int8x16_t max_s8, + bool is_bounded_relu) { if(result_shift < 0) { @@ -225,24 +223,23 @@ int8x16_t finalize_quantization(int32x4x4_t &in_s32, /** Performs final quantization step on 16 elements for symmetric quantization * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied - * - * @param in_s32 Input to be quantized. - * @param result_fixedpoint_multiplier Result multiplier parameter - * @param result_shift Result shift parameter - * @param result_offset_after_shift_s32 Result offset parameter - * @param min_s8 Relu lower bound - * @param max_s8 Relu upper bound + * @param[in] in_s32 Input to be quantized. + * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized values */ -template inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, const int32x4x4_t &result_fixedpoint_multiplier, const int32x4x4_t &result_shift, const int32x4_t &result_offset_after_shift_s32, const int8x16_t &min_s8, - const int8x16_t &max_s8) + const int8x16_t &max_s8, + const bool is_bounded_relu) { const static int32x4_t one_s32 = vdupq_n_s32(1); @@ -321,8 +318,6 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, } /** Performs final quantization step on single element - * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied * * @param[in] in_value Input to be quantized. * @param[in] result_fixedpoint_multiplier Result multiplier parameter @@ -330,13 +325,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, * @param[in] result_offset_after_shift_s32 Result offset parameter * @param[in] min_u8 Relu lower bound * @param[in] max_u8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized value */ -template inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift_s32, - uint8_t min_u8, uint8_t max_u8) + uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); @@ -366,8 +361,6 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul } /** Performs final quantization step on single element - * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied * * @param[in] in_value Input to be quantized. 
* @param[in] result_fixedpoint_multiplier Result multiplier parameter @@ -375,13 +368,13 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul * @param[in] result_offset_after_shift_s32 Result offset parameter * @param[in] min_s8 Relu lower bound * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized value */ -template inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift_s32, - int8_t min_s8, int8_t max_s8) + int8_t min_s8, int8_t max_s8, bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h index 0dc64c9842..203b26e422 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -115,22 +115,18 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - using NEGEMMLowpOffsetContributionOutputStageFunction = std::function; - private: /** Function to use for the particular tensors passed to configure() */ - NEGEMMLowpOffsetContributionOutputStageFunction _function; - const ITensor *_vector_sum_col; - const ITensor *_vector_sum_row; - const ITensor *_bias; - const ITensor *_mm_result; - ITensor *_output; - int32_t _a_offset; - int32_t _b_offset; - int32_t _k_offset; - bool _slide_vector_sum_col; - GEMMLowpOutputStageInfo _output_stage; + const ITensor *_vector_sum_col; + const ITensor *_vector_sum_row; + const ITensor *_bias; + const ITensor *_mm_result; + ITensor *_output; + int32_t _a_offset; + int32_t _b_offset; + int32_t _k_offset; + bool _slide_vector_sum_col; + GEMMLowpOutputStageInfo _output_stage; }; } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index 2f106a3f79..c016e2836c 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -275,7 +275,7 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window } const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max)); + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); }, in, out); } @@ -326,7 +326,7 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window } const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max)); + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); }, in, bi, out); } diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp 
b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index 31414e3f3f..019a204196 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -155,8 +155,7 @@ inline int32x4x4_t get_k_offset(int32_t k_offset) }; } -template -inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8) +inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -193,8 +192,7 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } -template -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -231,8 +229,7 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 return out_s8; } -template -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -307,13 +304,13 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) return bias_it; } -template +template inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, typename VT::vtype min_vec, typename VT::vtype max_vec, int32_t a_offset, int32_t b_offset, int32_t k_offset, int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x) + int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) { int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; if(!is_fixed_point) @@ -355,12 +352,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec)); + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); } else { wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec)); + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); } } // Compute left-over elements @@ -380,9 +377,9 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { // Finalize and store the result - *reinterpret_cast(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, - static_cast(min_bound), - static_cast(max_bound)); + *reinterpret_cast(out_it.ptr() + x) = 
finalize_quantization(in_value, multiplier, shift, offset, + static_cast(min_bound), + static_cast(max_bound), is_bounded_relu); } else { @@ -400,12 +397,11 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su } } -template inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, const int32_t *result_multipliers, const int32_t *result_shifts, const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x) + int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) { int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; if(!is_fixed_point) @@ -435,11 +431,11 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect if(is_fixed_point) { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8)); + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); } else { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8)); + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); } } // Compute left-over elements @@ -459,7 +455,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect if(is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound)); + *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound), is_bounded_relu); } else { @@ -476,11 +472,11 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect } } -template +template void run_offset_contribution_output_stage(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage) + GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; using Typer = VectorTyper; @@ -533,13 +529,13 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), - mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + 
run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), + mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -551,11 +547,11 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } @@ -576,12 +572,12 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); }, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -592,11 +588,11 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point); }, vector_sum_row_it, mm_result_it, out_it); } @@ -617,12 
+613,12 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -632,11 +628,11 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, mm_result_it, out_it); } @@ -648,11 +644,11 @@ void run_offset_contribution_output_stage(const Window &window, Iterator bias_it = get_bias_it(collapsed_window, bias); execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); }, bias_it, mm_result_it, out_it); } @@ -660,11 +656,11 @@ void run_offset_contribution_output_stage(const Window &window, { execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, 
false, false, is_bounded_relu, is_fixed_point); }, mm_result_it, out_it); } @@ -672,11 +668,10 @@ void run_offset_contribution_output_stage(const Window &window, } } -template void run_offset_contribution_output_stage_symm(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage) + GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) { ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); @@ -720,11 +715,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -734,11 +729,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, mm_result_it, out_it); } @@ -750,11 +745,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, Iterator bias_it = get_bias_it(collapsed_window, bias); execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); }, bias_it, mm_result_it, out_it); } @@ -762,11 +757,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, { execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, 
out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); }, mm_result_it, out_it); } @@ -860,81 +855,10 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, return std::make_pair(Status{}, win); } - -NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction -get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage) -{ - static std::map map_function_qasymm = - { - { 0, &run_offset_contribution_output_stage }, - { 1, &run_offset_contribution_output_stage }, - { 2, &run_offset_contribution_output_stage }, - { 3, &run_offset_contribution_output_stage }, - { 4, &run_offset_contribution_output_stage }, - { 5, &run_offset_contribution_output_stage }, - { 6, &run_offset_contribution_output_stage }, - { 7, &run_offset_contribution_output_stage }, - { 8, &run_offset_contribution_output_stage }, - { 9, &run_offset_contribution_output_stage }, - { 10, &run_offset_contribution_output_stage }, - { 11, &run_offset_contribution_output_stage }, - { 12, &run_offset_contribution_output_stage }, - { 13, &run_offset_contribution_output_stage }, - { 14, &run_offset_contribution_output_stage }, - { 15, &run_offset_contribution_output_stage }, - }; - - static std::map map_function_qsymm = - { - { 0, &run_offset_contribution_output_stage_symm }, - { 1, &run_offset_contribution_output_stage_symm }, - { 2, &run_offset_contribution_output_stage_symm }, - { 3, &run_offset_contribution_output_stage_symm }, - { 4, &run_offset_contribution_output_stage_symm }, - { 5, &run_offset_contribution_output_stage_symm }, - { 6, &run_offset_contribution_output_stage_symm }, - { 7, &run_offset_contribution_output_stage_symm } - }; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - // Check if we need to clamp the result using min and max - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(output->info()->data_type()); - int32_t type_min_int = type_min.get(); - int32_t type_max_int = type_max.get(); - const bool is_bounded_relu = !(output_stage.gemmlowp_min_bound <= type_min_int && output_stage.gemmlowp_max_bound >= type_max_int); - - // Check if we need to perform fixed point requantization - const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; - - // Check if symmetric per-channel execution - const bool is_signed = output->info()->data_type() == DataType::QASYMM8_SIGNED; - - // Check if symmetric per-channel execution - const bool is_symm = output_stage.is_quantized_per_channel; - - // key acts as a bitset, setting the first bit on reinterpret_as_3d, - // the second on is_bounded_relu, and the third on is_fixed_point. - uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 
1UL : 0UL) << 2); - if(is_symm) - { - return map_function_qsymm.find(key)->second; - } - else - { - key |= ((is_signed ? 1UL : 0UL) << 3); - return map_function_qasymm.find(key)->second; - } -} } // namespace NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel() - : _function(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true), + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true), _output_stage(GEMMLowpOutputStageInfo()) { @@ -977,8 +901,6 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_ auto win_config = validate_and_configure_window(mm_result->info(), output->info()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - - _function = get_configured_function(mm_result, vector_sum_row, output, output_stage); } Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, @@ -996,7 +918,46 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, co ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - _function(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage); + + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(_output->info()->data_type()); + int32_t type_min_int = type_min.get(); + int32_t type_max_int = type_max.get(); + + const bool reinterpret_as_3d = _vector_sum_row != nullptr + && _mm_result->info()->num_dimensions() > 1 + && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x(); + + const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); + + // Check if we need to perform fixed point requantization + const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; + + // Check if symmetric per-channel execution + const bool is_signed = _output->info()->data_type() == DataType::QASYMM8_SIGNED; + + // Check if symmetric per-channel execution + const bool is_symm = _output_stage.is_quantized_per_channel; + + if(is_symm) + { + run_offset_contribution_output_stage_symm(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + if(is_signed) + { + run_offset_contribution_output_stage(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + run_offset_contribution_output_stage(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp 
b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp index b8ca17ec3d..9400c9704a 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -140,7 +140,7 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8)); + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); } // Compute left-over elements @@ -152,8 +152,8 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window // Add bias in_value += bias_value; // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max)); + *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out, bias); @@ -177,7 +177,7 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window }; vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8)); + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); } // Compute left-over elements @@ -186,8 +186,8 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max)); + *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out); @@ -242,4 +242,4 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window (this->*_func)(window); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index 4a9d2f7481..78610c95a7 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -145,7 +145,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, 
is_bounded_relu)); } // Compute left-over elements @@ -157,7 +157,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window // Add bias in_value += bias_value; // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max)); + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out, bias); @@ -180,7 +180,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window } }; - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); } // Compute left-over elements @@ -189,7 +189,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max)); + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out); @@ -244,4 +244,4 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window (this->*_func)(window); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute -- cgit v1.2.1
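
For context, a minimal sketch of the refactoring pattern this patch applies follows. It is not part of the patch or of the library: the function names below are hypothetical, and the sketch only illustrates why replacing non-type bool template parameters with runtime arguments shrinks the binary. In the kernel itself, the old dispatch table enumerated 16 QASYMM and 8 QSYMM specialisations of run_offset_contribution_output_stage*, each emitted as separate code; the new version keeps a single copy and passes is_bounded_relu / is_fixed_point / is_gemm3d as ordinary arguments.

// Illustrative sketch only (hypothetical names, not the library's API).
#include <algorithm>
#include <cstdint>

// Before: one instantiation per template argument value ends up in the binary.
template <bool is_bounded_relu>
int32_t clamp_example_templated(int32_t v, int32_t lo, int32_t hi)
{
    return is_bounded_relu ? std::min(std::max(v, lo), hi) : v;
}

// After: a single function; the flag becomes a runtime argument. The extra
// branch is negligible next to the per-element vector arithmetic these
// kernels perform, while only one copy of the code is emitted.
inline int32_t clamp_example(int32_t v, int32_t lo, int32_t hi, bool is_bounded_relu)
{
    return is_bounded_relu ? std::min(std::max(v, lo), hi) : v;
}

int main()
{
    // Both calls produce the same result; only the second avoids an extra
    // template instantiation.
    const int32_t a = clamp_example_templated<true>(300, 0, 255);
    const int32_t b = clamp_example(300, 0, 255, true);
    return (a == b) ? 0 : 1;
}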