From dbdea0d1c025b18d4d82c278c87454427918f5b4 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 16 Oct 2019 19:21:40 +0100 Subject: COMPMID-2308: NEConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters Change-Id: Ic1bf5f0d21ccd525f84213a360f7e199d7f50577 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/2177 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- .../kernels/NEConvertQuantizedSignednessKernel.cpp | 136 +++++++ .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 4 +- .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 5 +- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 438 +++++++++++++++++---- .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 275 +++++++------ src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 2 +- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 2 +- 7 files changed, 658 insertions(+), 204 deletions(-) create mode 100644 src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp (limited to 'src/core/NEON') diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp new file mode 100644 index 0000000000..39e030e434 --- /dev/null +++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Output auto initialization if not yet initialized
+    {
+        const bool                    is_input_signed   = input->data_type() == DataType::QASYMM8_SIGNED;
+        const DataType                dt                = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo qinfo             = input->quantization_info().uniform();
+        const int                     offset_correction = is_input_signed ? -128 : 128;
+        const QuantizationInfo        corrected_qinfo   = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+
+        auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
+    }
+
+    return std::make_pair(Status{}, calculate_max_window(*output));
+}
+} // namespace
+
+NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+    _input  = input;
+    _output = output;
+
+    std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const uint8_t mask  = 128;
+    const auto    vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
+        const auto output_ptr =
reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index c929983162..a9c04824ae 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -45,9 +45,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, - DataType::U16, DataType::S16, DataType::U32, DataType::S32, - DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); if(output->total_size() != 0) diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 6cec51d5a2..8f5a208cbb 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -722,8 +722,8 @@ namespace { Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); TensorShape in0_shape = input0->tensor_shape(); @@ -917,6 +917,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo switch(_input0->info()->data_type()) { case DataType::S8: + case DataType::QASYMM8_SIGNED: { matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); break; diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index 46e53cec12..3ada3a3c4f 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -72,6 +72,58 @@ inline int32x4x4_t load(const int32_t *ptr, int32_t x) }; } +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b), + vaddq_s32(a.val[1], b), + vaddq_s32(a.val[2], b), + vaddq_s32(a.val[3], b) + } + }; +} + +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b.val[0]), + vaddq_s32(a.val[1], b.val[1]), + vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3]) + } + }; +} + 
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) +{ + return + { + { + vmulq_n_s32(a.val[0], mul_scalar), + vmulq_n_s32(a.val[1], mul_scalar), + vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar) + } + }; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) +{ + return + { + { + vmulq_s32(a.val[0], vld1q_s32(multilpier)), + vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), + vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) + } + }; +} + inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) { int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x); @@ -141,6 +193,82 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } +template +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +template +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0])); + in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1])); + in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2])); + in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3])); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + inline Window get_win_vector_sum(const Window &window) { Window win_vector_sum(window); @@ -172,50 +300,12 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) return bias_it; } -inline int32x4x4_t 
add_s32(int32x4x4_t a, int32x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; -} - -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; -} - -inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) -{ - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; -} - template inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, int32_t a_offset, int32_t b_offset, int32_t k_offset, - GEMMLowpOutputStageInfo output_stage, int window_step_x, int window_start_x, int window_end_x) + int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x) { int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; if(!is_fixed_point) @@ -251,12 +341,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su } if(!is_fixed_point) { - in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier); + in_s32 = mul_s32(in_s32, multiplier); } if(is_fixed_point) { - vst1q_u8(out_it.ptr() + x, finalize_quantization(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8)); + vst1q_u8(out_it.ptr() + x, finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8)); } else { @@ -280,24 +370,99 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, - output_stage.gemmlowp_offset, static_cast(output_stage.gemmlowp_min_bound), static_cast(output_stage.gemmlowp_max_bound)); + *(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, static_cast(min_bound), static_cast(max_bound)); } else { // Finalize quantization - in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift; + in_value = (in_value * multiplier) >> shift; // Bound and store the result if(is_bounded_relu) { - in_value = static_cast(std::max(output_stage.gemmlowp_min_bound, std::min(output_stage.gemmlowp_max_bound, in_value))); + in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); } *(out_it.ptr() + x) = static_cast(std::max(0, std::min(255, in_value))); } } } +template +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, + const int32_t *result_multipliers, const int32_t *result_shifts, + const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, + int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x) +{ + int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; + if(!is_fixed_point) + { + // Combine quantization offset with other offsets. 
+ offset_term_s32 = add_s32(offset_term_s32, result_offset); + } + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = load_results_input(mm_result_it, x); + + if(has_a_offset) + { + in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); + } + if(has_bias) + { + in_s32 = add_s32(in_s32, load(bias_ptr, x)); + } + if(!is_fixed_point) + { + in_s32 = add_s32(in_s32, offset_term_s32); + in_s32 = mul_s32(in_s32, result_multipliers + x); + } + + if(is_fixed_point) + { + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8)); + } + else + { + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8)); + } + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + + if(has_a_offset) + { + in_value += (*(vector_sum_col_ptr + x) * a_offset); + } + if(has_bias) + { + in_value += *(bias_ptr + x); + } + + if(is_fixed_point) + { + // Finalize and store the result + *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound)); + } + else + { + // Finalize quantization + in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); + + // Bound and store the result + if(is_bounded_relu) + { + in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); + } + *(out_it.ptr() + x) = static_cast(std::max(-128, std::min(127, in_value))); + } + } +} + template void run_offset_contribution_output_stage(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, @@ -307,10 +472,16 @@ void run_offset_contribution_output_stage(const Window &window, const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; - const int32x4_t result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift); - const uint8x16_t min_u8 = vdupq_n_u8(static_cast(output_stage.gemmlowp_min_bound)); - const uint8x16_t max_u8 = vdupq_n_u8(static_cast(output_stage.gemmlowp_max_bound)); + const int32_t multiplier = output_stage.gemmlowp_multiplier; + const int32_t shift = output_stage.gemmlowp_shift; + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? 
shift : -shift); + const uint8x16_t min_u8 = vdupq_n_u8(static_cast(min_bound)); + const uint8x16_t max_u8 = vdupq_n_u8(static_cast(max_bound)); const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -349,7 +520,8 @@ void run_offset_contribution_output_stage(const Window &window, run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -363,7 +535,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } @@ -386,7 +559,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -399,7 +573,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, mm_result_it, out_it); } @@ -422,7 +597,8 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -434,7 +610,8 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, 
max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, mm_result_it, out_it); } @@ -448,7 +625,8 @@ void run_offset_contribution_output_stage(const Window &window, { run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, bias_it, mm_result_it, out_it); } @@ -458,7 +636,110 @@ void run_offset_contribution_output_stage(const Window &window, { run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + mm_result_it, out_it); + } + return; + } +} + +template +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, + GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); + + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int8x16_t min_s8 = vdupq_n_s8(static_cast(min_bound)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast(max_bound)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); + + Iterator mm_result_it(mm_result, win); + Iterator out_it(output, win); + + if(a_offset != 0) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; + + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + vector_sum_col_it, mm_result_it, out_it); + } + } + else + { + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, mm_result_it, out_it); } @@ -470,8 +751,18 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + if(output->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 127); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < -128); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); + } + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); if(bias != nullptr) @@ -525,7 +816,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto if(output->total_size() != 0) { - 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); } @@ -551,7 +842,7 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, } NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction -get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage) +get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage) { static std::map map_function = { @@ -562,7 +853,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, { 4, &run_offset_contribution_output_stage }, { 5, &run_offset_contribution_output_stage }, { 6, &run_offset_contribution_output_stage }, - { 7, &run_offset_contribution_output_stage } + { 7, &run_offset_contribution_output_stage_symm }, + { 8, &run_offset_contribution_output_stage_symm }, + { 9, &run_offset_contribution_output_stage_symm }, + { 10, &run_offset_contribution_output_stage_symm }, + { 11, &run_offset_contribution_output_stage_symm }, + { 12, &run_offset_contribution_output_stage_symm }, + { 13, &run_offset_contribution_output_stage_symm }, + { 14, &run_offset_contribution_output_stage_symm }, + { 15, &run_offset_contribution_output_stage_symm } }; // Check if input is a 3D reinterpretation @@ -574,11 +873,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound) && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255)); + // Check if we need to perform fixed point requantization const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; + // Check if symmetric per-channel execution + const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED; + // key acts as a bitset, setting the first bit on reinterpret_as_3d, // the second on is_bounded_relu, and the third on is_fixed_point. - uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2); + uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 
1UL : 0UL) << 3); return map_function.find(key)->second; } } // namespace @@ -591,8 +894,9 @@ NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutpu } void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col, - const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) + const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t k, int32_t a_offset, int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); @@ -627,7 +931,7 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - _function = get_configured_function(mm_result, vector_sum_row, output_stage); + _function = get_configured_function(mm_result, vector_sum_row, output, output_stage); } Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index c1ee770db5..72632492d7 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include #include #include @@ -48,7 +48,7 @@ namespace { Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); return Status{}; @@ -72,7 +72,7 @@ std::pair validate_and_configure_window_matrix_a_reduction(ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); return Status{}; @@ -128,11 +128,12 @@ Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, cons return Status{}; } -void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) +template +void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t; + using TAcc = wrapper::traits::promote_t; Window collapsed_window = 
window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -149,9 +150,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4_t sum_row = vdupq_n_u32(0); + auto sum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const T *matrix_a = reinterpret_cast((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -161,43 +162,41 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4); + const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4); - // Convert U8 to U16 - uint16x4x4_t a0_u16 = + // Convert 8-bit to 16-bit + typename wrapper::traits::neon_bitvector::type a0_d16[4] = { - { - vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(a0_u8))) - } + wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))), + wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))), + wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))), + wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8))) }; - // Accumulate to U16 - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]); - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]); - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]); + // Accumulate to 16-bit + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]); + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]); + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]); - // Accumulate to U32 - sum_row = vaddw_u16(sum_row, a0_u16.val[0]); + // Accumulate to 32-bit + sum_row = wrapper::vaddw(sum_row, a0_d16[0]); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4); + const auto a0_d8 = wrapper::vload(matrix_a + i * 4); // Convert U8 to U16 - const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8)); + const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8)); // Accumulate to U32 - sum_row = vaddw_u16(sum_row, a0_u16); + sum_row = wrapper::vaddw(sum_row, a0_d16); } auto vector_sum_row = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row)); + wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row)); }, in, out); } @@ -206,10 +205,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4_t sum_row_u32 = vdupq_n_u32(0); - uint32_t sum_row = 0; + auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; - const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * 
_input->info()->strides_in_bytes()[2])); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -219,37 +218,57 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 16 accumulations for(; i <= (_k - 16); i += 16) { - const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i); + const auto a0_d8 = wrapper::vloadq(matrix_a + i); // Partial accumulations in U16 - const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8)); + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); // Accumulate to U32 - sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0)); + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - sum_row += static_cast(matrix_a[i]); + sum_row += static_cast(matrix_a[i]); } #if defined(__aarch64__) // Reduction operation available on 64 bit architectures only - sum_row += vaddvq_u32(sum_row_u32); + sum_row += wrapper::vaddv(vsum_row); #else // __aarch64__ - uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32)); - tmp = vpadd_u32(tmp, tmp); + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); - sum_row += vget_lane_u32(tmp, 0); + sum_row += wrapper::vgetlane(tmp, 0); #endif // __aarch64__ - *(reinterpret_cast(out.ptr())) = static_cast(sum_row); + *(reinterpret_cast(out.ptr())) = static_cast(sum_row); }, in, out); } } +void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_input->info()->data_type()) + { + case DataType::QASYMM8: + run_internal(window); + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + run_internal(window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } +} + void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); @@ -276,11 +295,12 @@ Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, cons return Status{}; } -void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) +template +void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t; + using TAcc = wrapper::traits::promote_t; Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -297,17 +317,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4x4_t sum_col = + typename wrapper::traits::neon_bitvector::type sum_col[4] = { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), 
wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]; + const auto *matrix_b = reinterpret_cast(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -316,35 +334,28 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf int i = 0; for(; i < _k; ++i) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16); + const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16); - // Convert S8 to U16 - const uint16x8x2_t b0_u16 = + // Convert 8bit to 16bit + const typename wrapper::traits::neon_bitvector::type b0_b16[2] = { - { - vmovl_u8(vget_low_u8(b0_u8)), - vmovl_u8(vget_high_u8(b0_u8)) - } + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) }; // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) - } - }; + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); - vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); - vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); - vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); }, in, out); } @@ -377,17 +388,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf } // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4x4_t sum_col = + typename wrapper::traits::neon_bitvector::type sum_col[4] = { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]; + const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -398,10 +407,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); - const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride); - const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride); - const 
uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride); + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); #if __arm__ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); @@ -410,34 +419,27 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); #endif /* __arm__ */ - // Partial accumulation in u16 - uint16x8x2_t tmp_sum = + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector::type tmp_sum[2] = { - { - vdupq_n_u16(0), - vdupq_n_u16(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8)); - - // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1])) - } - }; + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); matrix_b += 4 * in_b_stride; } @@ -445,38 +447,51 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop perfoms the leftover accumulations for(; i < _k; ++i) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); // Convert S8 to S16 - const uint16x8x2_t b0_u16 = + const typename wrapper::traits::neon_bitvector::type b0_b16[2] { - { - vmovl_u8(vget_low_u8(b0_u8)), - vmovl_u8(vget_high_u8(b0_u8)) - } + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) }; - // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) - } - }; + // Accumulate to 32bit + 
sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); matrix_b += in_b_stride; } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); - vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); - vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); - vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); }, inb, out); } } + +void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_input->info()->data_type()) + { + case DataType::QASYMM8: + run_internal(window, info); + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + run_internal(window, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } +} diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 0ca7fd3dc8..ea3d32e628 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -55,7 +55,7 @@ TensorShape get_output_shape(const ITensorInfo *input) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp index 624833adfb..649316442e 100644 --- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp @@ -49,7 +49,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); if(biases != nullptr) -- cgit v1.2.1
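
The new NEConvertQuantizedSignednessKernel above flips a tensor between QASYMM8 and QASYMM8_SIGNED by XOR-ing every byte with a 128 mask (16 bytes at a time via wrapper::veor) and pairing that with a +/-128 correction of the quantization offset. The scalar sketch below is not library code, just a plain C++ illustration of why the XOR is equivalent to shifting the stored value by 128, which is what makes the round trip lossless.

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const uint8_t mask = 128; // same constant the kernel broadcasts into a vector

    std::vector<uint8_t> qasymm8 = { 0, 1, 127, 128, 200, 255 };

    for(uint8_t q : qasymm8)
    {
        // Flip the sign bit; reinterpreting the result as int8_t yields q - 128.
        const uint8_t flipped = static_cast<uint8_t>(q ^ mask);
        const int8_t  q_s8    = static_cast<int8_t>(flipped);

        // Flipping again restores the original unsigned value.
        const uint8_t restored = static_cast<uint8_t>(static_cast<uint8_t>(q_s8) ^ mask);

        std::printf("u8 %3u -> s8 %4d -> u8 %3u\n",
                    static_cast<unsigned>(q), static_cast<int>(q_s8), static_cast<unsigned>(restored));
    }
    return 0;
}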
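
get_configured_function() in NEGEMMLowpOffsetContributionOutputStageKernel.cpp packs reinterpret_as_3d, is_bounded_relu, is_fixed_point and the new is_symm flag into a 4-bit key and looks the key up in a map of pre-instantiated template functions; the extra bit is what doubles the map from 8 to 16 entries in this patch. The self-contained sketch below only illustrates that dispatch idea: DemoFn and the lambda bodies are placeholder stand-ins, not the library's run_offset_contribution_output_stage(_symm) instantiations.

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>

using DemoFn = std::function<void()>;

int main()
{
    const bool reinterpret_as_3d = false;
    const bool is_bounded_relu   = true; // min/max bounds narrower than the full output range
    const bool is_fixed_point    = true; // QUANTIZE_DOWN_FIXEDPOINT output stage
    const bool is_symm           = true; // QASYMM8_SIGNED output, i.e. per-channel symmetric weights

    // Same packing as the patch: bit0 = 3D, bit1 = bounded ReLU, bit2 = fixed point, bit3 = symmetric.
    const uint8_t key = (reinterpret_as_3d ? 1U : 0U)
                      | ((is_bounded_relu ? 1U : 0U) << 1)
                      | ((is_fixed_point ? 1U : 0U) << 2)
                      | ((is_symm ? 1U : 0U) << 3);

    // With this packing, keys with the top bit set (8-15) are the symmetric variants.
    std::map<uint8_t, DemoFn> map_function;
    for(uint8_t k = 0; k < 16; ++k)
    {
        map_function[k] = [k]() { std::cout << "would run variant for key " << static_cast<int>(k) << "\n"; };
    }

    map_function.at(key)(); // prints "would run variant for key 14"
    return 0;
}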