diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2019-12-02 19:01:25 +0000 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2019-12-04 12:44:28 +0000 |
commit | 6e1791b1bfabc81f08d3117939f6eb5264ed4edf (patch) | |
tree | b984d58856ef9baa168bcf878659caddf599f623 /src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp | |
parent | 5cb49dcf7ad74cc6e7e91790b7132ae4dd845515 (diff) | |
download | ComputeLibrary-6e1791b1bfabc81f08d3117939f6eb5264ed4edf.tar.gz |
COMPMID-2764: Add support for QASYMM8_SIGNED in NEConvolutionLayer.
Change-Id: I8fbbd2e399f48968337a60147098d04f27c2d1c0
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2402
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp')
-rw-r--r-- | src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp | 182 |
1 files changed, 116 insertions, 66 deletions
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index a32f0bbdae..84187332f8 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -269,6 +269,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 return out_s8; } +template <typename T> +struct VectorTyper +{ + using stype = T; + using vtype = typename wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128>; +}; + inline Window get_win_vector_sum(const Window &window) { Window win_vector_sum(window); @@ -300,9 +307,10 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) return bias_it; } -template <bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point> +template <typename VT, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point> inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, + const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, + typename VT::vtype min_vec, typename VT::vtype max_vec, int32_t a_offset, int32_t b_offset, int32_t k_offset, int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, int window_step_x, int window_start_x, int window_end_x) @@ -346,11 +354,13 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { - vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8)); + wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec)); } else { - vst1q_u8(out_it.ptr() + x, finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8)); + wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_vec, max_vec)); } } // Compute left-over elements @@ -370,7 +380,9 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset, static_cast<uint8_t>(min_bound), static_cast<uint8_t>(max_bound)); + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset, + static_cast<typename VT::stype>(min_bound), + static_cast<typename VT::stype>(max_bound)); } else { @@ -380,9 +392,10 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su // Bound and store the result if(is_bounded_relu) { - in_value = static_cast<uint8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); + in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); } - *(out_it.ptr() + x) = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value))); + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()), + std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value))); } } } @@ -463,12 +476,15 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect } } -template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point> +template <typename T, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point> void run_offset_contribution_output_stage(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, GEMMLowpOutputStageInfo output_stage) { + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + using Typer = VectorTyper<T>; + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; @@ -478,10 +494,10 @@ void run_offset_contribution_output_stage(const Window &window, const int32_t min_bound = output_stage.gemmlowp_min_bound; const int32_t max_bound = output_stage.gemmlowp_max_bound; - const int32x4_t result_offset_s32 = vdupq_n_s32(offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift); - const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(min_bound)); - const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(max_bound)); + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift); + const auto min_vec = wrapper::vdup_n(static_cast<T>(min_bound), ExactTagType{}); + const auto max_vec = wrapper::vdup_n(static_cast<T>(max_bound), ExactTagType{}); const int window_step_x = 16; const auto window_start_x = static_cast<int>(window.x().start()); @@ -517,11 +533,13 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), + mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -533,10 +551,11 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } @@ -557,10 +576,12 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -571,10 +592,11 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, mm_result_it, out_it); } @@ -595,10 +617,12 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window<true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -608,10 +632,11 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window<true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, mm_result_it, out_it); } @@ -623,10 +648,11 @@ void run_offset_contribution_output_stage(const Window &window, Iterator bias_it = get_bias_it(collapsed_window, bias); execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window<false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, bias_it, mm_result_it, out_it); } @@ -634,10 +660,11 @@ void run_offset_contribution_output_stage(const Window &window, { execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window<false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window<Typer, false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, mm_result_it, out_it); } @@ -844,24 +871,36 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage) { - static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function = - { - { 0, &run_offset_contribution_output_stage<false, false, false> }, - { 1, &run_offset_contribution_output_stage<true, false, false> }, - { 2, &run_offset_contribution_output_stage<false, true, false> }, - { 3, &run_offset_contribution_output_stage<true, true, false> }, - { 4, &run_offset_contribution_output_stage<false, false, true> }, - { 5, &run_offset_contribution_output_stage<true, false, true> }, - { 6, &run_offset_contribution_output_stage<false, true, true> }, - { 7, &run_offset_contribution_output_stage<true, true, true> }, - { 8, &run_offset_contribution_output_stage_symm<false, false, false> }, - { 9, &run_offset_contribution_output_stage_symm<true, false, false> }, - { 10, &run_offset_contribution_output_stage_symm<false, true, false> }, - { 11, &run_offset_contribution_output_stage_symm<true, true, false> }, - { 12, &run_offset_contribution_output_stage_symm<false, false, true> }, - { 13, &run_offset_contribution_output_stage_symm<true, false, true> }, - { 14, &run_offset_contribution_output_stage_symm<false, true, true> }, - { 15, &run_offset_contribution_output_stage_symm<true, true, true> } + static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qasymm = + { + { 0, &run_offset_contribution_output_stage<uint8_t, false, false, false> }, + { 1, &run_offset_contribution_output_stage<uint8_t, true, false, false> }, + { 2, &run_offset_contribution_output_stage<uint8_t, false, true, false> }, + { 3, &run_offset_contribution_output_stage<uint8_t, true, true, false> }, + { 4, &run_offset_contribution_output_stage<uint8_t, false, false, true> }, + { 5, &run_offset_contribution_output_stage<uint8_t, true, false, true> }, + { 6, &run_offset_contribution_output_stage<uint8_t, false, true, true> }, + { 7, &run_offset_contribution_output_stage<uint8_t, true, true, true> }, + { 8, &run_offset_contribution_output_stage<int8_t, false, false, false> }, + { 9, &run_offset_contribution_output_stage<int8_t, true, false, false> }, + { 10, &run_offset_contribution_output_stage<int8_t, false, true, false> }, + { 11, &run_offset_contribution_output_stage<int8_t, true, true, false> }, + { 12, &run_offset_contribution_output_stage<int8_t, false, false, true> }, + { 13, &run_offset_contribution_output_stage<int8_t, true, false, true> }, + { 14, &run_offset_contribution_output_stage<int8_t, false, true, true> }, + { 15, &run_offset_contribution_output_stage<int8_t, true, true, true> }, + }; + + static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qsymm = + { + { 0, &run_offset_contribution_output_stage_symm<false, false, false> }, + { 1, &run_offset_contribution_output_stage_symm<true, false, false> }, + { 2, &run_offset_contribution_output_stage_symm<false, true, false> }, + { 3, &run_offset_contribution_output_stage_symm<true, true, false> }, + { 4, &run_offset_contribution_output_stage_symm<false, false, true> }, + { 5, &run_offset_contribution_output_stage_symm<true, false, true> }, + { 6, &run_offset_contribution_output_stage_symm<false, true, true> }, + { 7, &run_offset_contribution_output_stage_symm<true, true, true> } }; // Check if input is a 3D reinterpretation @@ -877,12 +916,23 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; // Check if symmetric per-channel execution - const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED; + const bool is_signed = output->info()->data_type() == DataType::QASYMM8_SIGNED; + + // Check if symmetric per-channel execution + const bool is_symm = output_stage.is_quantized_per_channel; // key acts as a bitset, setting the first bit on reinterpret_as_3d, // the second on is_bounded_relu, and the third on is_fixed_point. - uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 1UL : 0UL) << 3); - return map_function.find(key)->second; + uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2); + if(is_symm) + { + return map_function_qsymm.find(key)->second; + } + else + { + key |= ((is_signed ? 1UL : 0UL) << 3); + return map_function_qasymm.find(key)->second; + } } } // namespace |