From c229e8cadf0d7d088f17f02619eb4dd8af94244f Mon Sep 17 00:00:00 2001
From: morgolock
Date: Fri, 25 Sep 2020 12:03:21 +0100
Subject: COMPMID-3183: Removed padding NEGEMMLowpReductionKernel

Change-Id: Ibf7741ffdefcceb9683c919e79302fc35c36ea65
Signed-off-by: morgolock
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4031
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Gian Marco Iodice
---
 .../core/NEON/kernels/NEGEMMLowpReductionKernel.h  |   1 -
 .../NEON/kernels/NEGEMMLowpReductionKernel.cpp     | 479 +++++++--------------
 2 files changed, 162 insertions(+), 318 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 53a542c2df..f41941f796 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -63,7 +63,6 @@ protected:
     const ITensor *_input;
     ITensor       *_output;
     int32_t        _k;
-    bool           _is_reshaped;
     int32_t        _scalar;
     bool           _mul_by_scalar;
 };
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 29453072a1..3ac2efc397 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -45,26 +45,6 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
     }
     return Status{};
 }
-std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
-{
-    const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, TensorShape(input->dimension(1)), 1, DataType::S32);
-
-    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
 Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -77,31 +57,10 @@ Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
     }
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
-{
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32);
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
 } // namespace
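The two helper functions deleted above were the source of the padding requirement: each attached an AccessWindowStatic covering ceil_to_multiple(input->dimension(0), 16) columns of the input, so a tensor whose width was not already a multiple of 16 had to be allocated with right padding, and configuration otherwise failed with "Insufficient Padding!". A minimal standalone sketch of that arithmetic (ceil_to_multiple is reimplemented here for illustration; this is not ACL code):

#include <cstdio>

// Illustrative stand-in for arm_compute::ceil_to_multiple
static unsigned int ceil_to_multiple(unsigned int value, unsigned int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    const unsigned int width        = 100;                         // e.g. K = 100, not a multiple of 16
    const unsigned int padded_width = ceil_to_multiple(width, 16); // 112
    std::printf("old access window read %u columns -> %u padding elements per row\n",
                padded_width, padded_width - width);
    return 0;
}

The replacement configure paths below size the kernel window from the unpadded output instead and leave all tail handling to the kernels at run time.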

 INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
-    : _input(), _output(), _k(0), _is_reshaped(false), _scalar(0), _mul_by_scalar(false)
+    : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false)
 {
 }

@@ -109,26 +68,27 @@ void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+    ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
-
     _input         = mtx_a;
     _output        = vector_sum_row;
     _k             = info.k;
-    _is_reshaped   = info.is_reshaped;
     _scalar        = info.scalar;
     _mul_by_scalar = info.mul_by_scalar;

-    // Configure kernel window
-    auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32);
+
+    Window win = calculate_max_window(*_output->info(), Steps(1));
+    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
 }

 Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
 {
+    ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), info.is_reshaped).first);
-
     return Status{};
 }
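Functionally the kernels are unchanged: they produce the per-row sums of matrix A (and, further down, the per-column sums of matrix B), optionally multiplied by a scalar, which feed the offset-correction terms of the low-precision GEMM. A scalar reference of the matrix-A reduction (illustrative sketch with hypothetical names, not ACL code):

#include <cstdint>
#include <vector>

// vector_sum_row[r] = scalar * sum over c of A[r][c], A stored row-major (M x K)
std::vector<int32_t> sum_rows(const std::vector<uint8_t> &a, int m, int k, int32_t scalar)
{
    std::vector<int32_t> out(m, 0);
    for(int r = 0; r < m; ++r)
    {
        int32_t acc = 0;
        for(int c = 0; c < k; ++c)
        {
            acc += a[r * k + c];
        }
        out[r] = acc * scalar;
    }
    return out;
}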

@@ -149,121 +109,55 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
     Iterator in(_input, win_input);
     Iterator out(_output, collapsed_window);

-    const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
-
-    if(_is_reshaped)
+    execute_window_loop(collapsed_window, [&](const Coordinates & id)
     {
-        execute_window_loop(collapsed_window, [&](const Coordinates & id)
-        {
-            auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+        auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+        TAcc sum_row  = 0;

-            const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
+        const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));

 #if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
 #endif /* __arm__ */

-            int i = 0;
-            // This for loop performs 4 accumulations
-            for(; i <= (_k - 4); i += 4)
-            {
-                const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);
-
-                // Convert 8-bit to 16-bit
-                typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =
-                {
-                    wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
-                    wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
-                    wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),
-                    wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))
-                };
-
-                // Accumulate to 16-bit
-                a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);
-                a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);
-                a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);
-
-                // Accumulate to 32-bit
-                sum_row = wrapper::vaddw(sum_row, a0_d16[0]);
-            }
-
-            // This for loop performs the leftover accumulations
-            for(; i < _k; ++i)
-            {
-                const auto a0_d8 = wrapper::vload(matrix_a + i * 4);
-
-                // Convert U8 to U16
-                const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));
-
-                // Accumulate to U32
-                sum_row = wrapper::vaddw(sum_row, a0_d16);
-            }
-
-            // Multiply by scalar if necessary
-            if(_mul_by_scalar)
-            {
-                sum_row = wrapper::vmul(sum_row, vec_scalar);
-            }
-
-            auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
-
-            wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row));
-        },
-        in, out);
-    }
-    else // it is not reshaped
-    {
-        execute_window_loop(collapsed_window, [&](const Coordinates & id)
-        {
-            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
-            auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
-            TAcc sum_row = 0;
-
-            const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
-
-#if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
-#endif /* __arm__ */
-
-            int i = 0;
-            // This for loop performs 16 accumulations
-            for(; i <= (_k - 16); i += 16)
-            {
-                const auto a0_d8 = wrapper::vloadq(matrix_a + i);
+        int i = 0;
+        // This for loop performs 16 accumulations
+        for(; i <= (_k - 16); i += 16)
+        {
+            const auto a0_d8 = wrapper::vloadq(matrix_a + i);

-                // Partial accumulations in U16
-                const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
+            // Partial accumulations in U16
+            const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));

-                // Accumulate to U32
-                vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
-            }
+            // Accumulate to U32
+            vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
+        }

-            // This for loop performs the leftover accumulations
-            for(; i < _k; ++i)
-            {
-                sum_row += static_cast<TAcc>(matrix_a[i]);
-            }
+        // This for loop performs the leftover accumulations
+        for(; i < _k; ++i)
+        {
+            sum_row += static_cast<TAcc>(matrix_a[i]);
+        }

 #if defined(__aarch64__)
-            // Reduction operation available on 64 bit architectures only
-            sum_row += wrapper::vaddv(vsum_row);
+        // Reduction operation available on 64 bit architectures only
+        sum_row += wrapper::vaddv(vsum_row);
 #else // __aarch64__
-            auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
-            tmp      = wrapper::vpadd(tmp, tmp);
+        auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+        tmp      = wrapper::vpadd(tmp, tmp);

-            sum_row += wrapper::vgetlane(tmp, 0);
+        sum_row += wrapper::vgetlane(tmp, 0);
 #endif // __aarch64__

-            // Multiply by scalar if necessary
-            if(_mul_by_scalar)
-            {
-                sum_row *= _scalar;
-            }
+        // Multiply by scalar if necessary
+        if(_mul_by_scalar)
+        {
+            sum_row *= _scalar;
+        }

-            *(reinterpret_cast<int32_t *>(out.ptr())) = static_cast<int32_t>(sum_row);
-        },
-        in, out);
-    }
+        *(reinterpret_cast<int32_t *>(out.ptr())) = static_cast<int32_t>(sum_row);
+    },
+    in, out);
 }

 void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
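The new loop body above never reads past the K-th element of a row: a full 16-wide vector loop is followed by a scalar tail, which is what makes input padding unnecessary. The same shape in plain NEON intrinsics (a standalone sketch of the uint8 case, not the ACL wrapper API):

#include <arm_neon.h>
#include <cstdint>

uint32_t row_sum_u8(const uint8_t *row, int k)
{
    uint32x4_t vsum = vdupq_n_u32(0);
    uint32_t   sum  = 0;

    int i = 0;
    for(; i <= k - 16; i += 16)
    {
        const uint8x16_t a = vld1q_u8(row + i);
        // Widen low/high halves and add: 16 x u8 -> 8 x u16 (max 2*255, no overflow)
        const uint16x8_t t = vaddl_u8(vget_low_u8(a), vget_high_u8(a));
        // Pairwise widen to u32 and accumulate
        vsum = vaddq_u32(vsum, vpaddlq_u16(t));
    }
    for(; i < k; ++i) // scalar tail instead of reading padding
    {
        sum += row[i];
    }
#if defined(__aarch64__)
    sum += vaddvq_u32(vsum); // horizontal reduction, AArch64 only
#else
    uint32x2_t t2 = vpadd_u32(vget_low_u32(vsum), vget_high_u32(vsum));
    t2 = vpadd_u32(t2, t2);
    sum += vget_lane_u32(t2, 0);
#endif
    return sum;
}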

@@ -290,26 +184,32 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
 void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+    ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));

     _input         = mtx_b;
     _output        = vector_sum_col;
     _k             = info.k;
-    _is_reshaped   = info.is_reshaped;
     _scalar        = info.scalar;
     _mul_by_scalar = info.mul_by_scalar;

     // Configure kernel window
-    auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32);
+
+    // Configure kernel window
+    Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration));
+    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+    INEKernel::configure(win);
 }

 Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);

     return Status{};
 }
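The run_internal hunk below also changes how work is split: each thread starts at column 16 * thread_id and strides by 16 * num_threads, with the end rounded up by ceil_to_multiple so every thread covers a whole number of 16-column steps. A rough illustration of the resulting ranges (standalone sketch; assumes the ThreadInfo-style thread_id/num_threads fields used below):

#include <cstdio>

int main()
{
    const int width       = 100; // elements of vector_sum_col
    const int num_threads = 3;
    for(int t = 0; t < num_threads; ++t)
    {
        const int start = 16 * t;
        const int step  = 16 * num_threads;
        // End rounded up so (end - start) is a multiple of step
        const int end = ((width - start + step - 1) / step) * step + start;
        std::printf("thread %d: x = %d, %d, ... (< %d)\n", t, start, start + step, end);
    }
    return 0;
}

Because the end is rounded up, an iteration can start beyond the last valid column, which is why the loop body in the hunk below begins with an early-return guard on id.x().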

@@ -321,198 +221,143 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
     using TIAcc = wrapper::traits::promote_t<T>;
     using TAcc  = wrapper::traits::promote_t<TIAcc>;

-    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
-    const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
-
-    if(_is_reshaped)
-    {
-        Window win_input(collapsed_window);
-        win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
-        win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
-        win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-        Iterator in(_input, win_input);
-        Iterator out(_output, collapsed_window);
-
-        execute_window_loop(collapsed_window, [&](const Coordinates & id)
-        {
-            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
-            typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
-            {
-                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
-                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
-                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
-                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
-            };
-
-            const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
+    const auto vec_scalar   = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});

 #if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
 #endif /* __arm__ */
+    const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
+    const auto in_b_stride    = static_cast<int>(_input->info()->strides_in_bytes()[1]);

-            int i = 0;
-            for(; i < _k; ++i)
-            {
-                const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);
+    // The implementation computes 16 elements per iteration
+    const int window_start_x = 16 * info.thread_id;
+    const int window_step_x  = 16 * info.num_threads;
+    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

-                // Convert 8bit to 16bit
-                const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =
-                {
-                    wrapper::vmovl(wrapper::vgetlow(b0_b8)),
-                    wrapper::vmovl(wrapper::vgethigh(b0_b8))
-                };
-
-                // Accumulate to U32
-                sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
-                sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
-                sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
-                sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
-            }
+    Window win_out(collapsed_window);
+    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

-            // Multiply by scalar if necessary
-            if(_mul_by_scalar)
-            {
-                sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
-                sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
-                sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
-                sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
-            }
+    Window win_in(win_out);
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

-            auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+    Iterator inb(_input, win_in);
+    Iterator out(_output, win_out);

-            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
-            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
-            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
-            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
-        },
-        in, out);
-    }
-    else // it is not reshaped
+    execute_window_loop(win_out, [&](const Coordinates & id)
     {
-        const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
-        const auto in_b_stride    = static_cast<int>(_input->info()->strides_in_bytes()[1]);
-
-        // The implementation computes 16 elements per iteration
-        const int window_start_x = 16 * info.thread_id;
-        const int window_step_x  = 16 * info.num_threads;
-        // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
-        const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-
-        Window win_out(collapsed_window);
-        win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
-
-        Window win_in(win_out);
-        win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-        win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-        Iterator inb(_input, win_in);
-        Iterator out(_output, win_out);
-
-        execute_window_loop(win_out, [&](const Coordinates & id)
-        {
-            if(id.x() > width_matrix_b)
-            {
-                return;
-            }
+        if(id.x() > width_matrix_b)
+        {
+            return;
+        }

-            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
-            typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
-            {
-                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+        // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+        typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
+        {
+            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
+        };

-            const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
+        const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);

 #if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
-            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
+        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
 #endif /* __arm__ */

-            int i = 0;
-            // This for loop performs 4 accumulations
-            for(; i <= (_k - 4); i += 4)
-            {
-                const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
-                const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
-                const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
-                const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
+        int i = 0;
+        // This for loop performs 4 accumulations
+        for(; i <= (_k - 4); i += 4)
+        {
+            const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+            const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+            const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+            const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);

 #if __arm__
-                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
-                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
-                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
-                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
+            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
+            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
+            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
+            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
 #endif /* __arm__ */

-                // Partial accumulation in 16bit
-                typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
-                {
-                    wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
-                    wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
-                };
-
-                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
-                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
-                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
-                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
-                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
-                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
-                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
-                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
-
-                // Accumulate to 32bit
-                sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
-                sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
-                sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
-                sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
-
-                matrix_b += 4 * in_b_stride;
-            }
-
-            // This for loop performs the leftover accumulations
-            for(; i < _k; ++i)
-            {
-                const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+            // Partial accumulation in 16bit
+            typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
+            {
+                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
+            };

+            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+            // Accumulate to 32bit
+            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
+
+            matrix_b += 4 * in_b_stride;
+        }
+
+        // This for loop performs the leftover accumulations
+        for(; i < _k; ++i)
+        {
+            const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);

-                // Convert S8 to S16
-                const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
-                {
-                    wrapper::vmovl(wrapper::vgetlow(b0_b8)),
-                    wrapper::vmovl(wrapper::vgethigh(b0_b8))
-                };
-
-                // Accumulate to 32bit
-                sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
-                sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
-                sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
-                sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
+            // Convert S8 to S16
+            const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
+            {
+                wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+                wrapper::vmovl(wrapper::vgethigh(b0_b8))
+            };
+
+            // Accumulate to 32bit
+            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));

-                matrix_b += in_b_stride;
-            }
+            matrix_b += in_b_stride;
+        }

-            // Multiply by scalar if necessary
-            if(_mul_by_scalar)
-            {
-                sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
-                sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
-                sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
-                sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
-            }
+        // Multiply by scalar if necessary
+        if(_mul_by_scalar)
+        {
+            sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+            sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+            sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+            sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+        }

-            auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+        auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+        if(id.x() + 16 < width_matrix_b)
+        {
             wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
             wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
             wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
             wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
-        },
-        inb, out);
-    }
+        }
+        else
+        {
+            auto left_over = width_matrix_b - id.x();
+            for(auto k = 0; k < 4 && left_over; ++k)
+            {
+                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+                {
+                    *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+                }
+            }
+        }
+    },
+    inb, out);
 }

 void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
--
cgit v1.2.1
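One detail worth calling out from the final hunk is the store path: a full 16-column block is written with four 4-lane vector stores, while the last partial block is written lane by lane, so the output tensor needs no padding either. A standalone sketch of that tail store (plain arrays stand in for the NEON accumulators; not ACL code):

#include <cstdint>

// Write up to 16 results; left_over = remaining valid columns (< 16 at the tail)
void store_tail(int32_t *dst, const int32_t (&sum_col)[4][4], int left_over)
{
    for(int k = 0; k < 4 && left_over; ++k)
    {
        for(int j = 0; j < 4 && left_over; ++j, --left_over)
        {
            dst[k * 4 + j] = sum_col[k][j];
        }
    }
}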