author    morgolock <pablo.tello@arm.com>          2020-09-25 12:03:21 +0100
committer Pablo Marquez <pablo.tello@arm.com>      2020-10-02 13:06:39 +0000
commit    c229e8cadf0d7d088f17f02619eb4dd8af94244f (patch)
tree      23abf7504cc6467233f95216de635b0dd0fc2ca3
parent    b84f9d34dbb857ad6113c0c89ad109498fa75fe5 (diff)
COMPMID-3183: Removed padding from NEGEMMLowpReductionKernel
Change-Id: Ibf7741ffdefcceb9683c919e79302fc35c36ea65
Signed-off-by: morgolock <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4031
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h |   1
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp       | 479
2 files changed, 162 insertions(+), 318 deletions(-)
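
This patch drops the reshaped-input code path and the window/padding helpers from both lowp reduction kernels: configure() now rejects info.is_reshaped, the kernel window is derived directly from the (auto-initialised) output shape, and neither the input nor the output tensor needs extra padding. A minimal configuration sketch follows; only the GEMMLowpReductionKernelInfo fields (k, is_reshaped, scalar, mul_by_scalar) and the configure() signature come from this patch, while the tensor shapes, data type and header paths are illustrative assumptions, not part of the change.

// Illustrative sketch only (not part of the patch): configuring a matrix-A
// row-sum reduction after this change. Shapes and setup are assumptions.
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void configure_a_reduction_example()
{
    // A is M x K (quantized 8-bit); the row-sum output is a vector of M int32 values.
    Tensor mtx_a;
    Tensor vector_sum_row;
    mtx_a.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8));

    GEMMLowpReductionKernelInfo info{};
    info.k             = 64;    // number of columns of A to accumulate per row
    info.is_reshaped   = false; // reshaped inputs are no longer supported
    info.scalar        = 0;
    info.mul_by_scalar = false;

    NEGEMMLowpMatrixAReductionKernel kernel;
    // vector_sum_row is auto-initialised to shape (M), S32 by configure();
    // after this patch no padding is required on either tensor.
    kernel.configure(&mtx_a, &vector_sum_row, info);
}
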
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 53a542c2df..f41941f796 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -63,7 +63,6 @@ protected:
const ITensor *_input;
ITensor *_output;
int32_t _k;
- bool _is_reshaped;
int32_t _scalar;
bool _mul_by_scalar;
};
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 29453072a1..3ac2efc397 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -45,26 +45,6 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITe
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
-{
- const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, TensorShape(input->dimension(1)), 1, DataType::S32);
-
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -77,31 +57,10 @@ Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITe
}
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32);
-
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
- : _input(), _output(), _k(0), _is_reshaped(false), _scalar(0), _mul_by_scalar(false)
+ : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false)
{
}
@@ -109,26 +68,27 @@ void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+ ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
-
_input = mtx_a;
_output = vector_sum_row;
_k = info.k;
- _is_reshaped = info.is_reshaped;
_scalar = info.scalar;
_mul_by_scalar = info.mul_by_scalar;
- // Configure kernel window
- auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32);
+
+ Window win = calculate_max_window(*_output->info(), Steps(1));
+ _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
}
Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), info.is_reshaped).first);
-
return Status{};
}
@@ -149,121 +109,55 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w
Iterator in(_input, win_input);
Iterator out(_output, collapsed_window);
- const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
-
- if(_is_reshaped)
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ TAcc sum_row = 0;
- const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
#endif /* __arm__ */
- int i = 0;
- // This for loop performs 4 accumulations
- for(; i <= (_k - 4); i += 4)
- {
- const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);
-
- // Convert 8-bit to 16-bit
- typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =
- {
- wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
- wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
- wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),
- wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))
- };
-
- // Accumulate to 16-bit
- a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);
- a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);
- a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);
-
- // Accumulate to 32-bit
- sum_row = wrapper::vaddw(sum_row, a0_d16[0]);
- }
-
- // This for loop performs the leftover accumulations
- for(; i < _k; ++i)
- {
- const auto a0_d8 = wrapper::vload(matrix_a + i * 4);
-
- // Convert U8 to U16
- const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));
-
- // Accumulate to U32
- sum_row = wrapper::vaddw(sum_row, a0_d16);
- }
-
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_row = wrapper::vmul(sum_row, vec_scalar);
- }
-
- auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
-
- wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row));
- },
- in, out);
- }
- else // it is not reshaped
- {
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ int i = 0;
+ // This for loop performs 16 accumulations
+ for(; i <= (_k - 16); i += 16)
{
- // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
- TAcc sum_row = 0;
-
- const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
-#endif /* __arm__ */
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i);
- int i = 0;
- // This for loop performs 16 accumulations
- for(; i <= (_k - 16); i += 16)
- {
- const auto a0_d8 = wrapper::vloadq(matrix_a + i);
+ // Partial accumulations in U16
+ const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
- // Partial accumulations in U16
- const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
+ // Accumulate to U32
+ vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
+ }
- // Accumulate to U32
- vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
- }
-
- // This for loop performs the leftover accumulations
- for(; i < _k; ++i)
- {
- sum_row += static_cast<TAcc>(matrix_a[i]);
- }
+ // This for loop performs the leftover accumulations
+ for(; i < _k; ++i)
+ {
+ sum_row += static_cast<TAcc>(matrix_a[i]);
+ }
#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- sum_row += wrapper::vaddv(vsum_row);
+ // Reduction operation available on 64 bit architectures only
+ sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
- auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
- tmp = wrapper::vpadd(tmp, tmp);
+ auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+ tmp = wrapper::vpadd(tmp, tmp);
- sum_row += wrapper::vgetlane(tmp, 0);
+ sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_row *= _scalar;
- }
+ // Multiply by scalar if necessary
+ if(_mul_by_scalar)
+ {
+ sum_row *= _scalar;
+ }
- *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
- },
- in, out);
- }
+ *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
+ },
+ in, out);
}
void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
@@ -290,26 +184,32 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+ ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
+
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
_input = mtx_b;
_output = vector_sum_col;
_k = info.k;
- _is_reshaped = info.is_reshaped;
_scalar = info.scalar;
_mul_by_scalar = info.mul_by_scalar;
// Configure kernel window
- auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32);
+
+ // Configure kernel window
+ Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration));
+ _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+ INEKernel::configure(win);
}
Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
return Status{};
}
@@ -321,198 +221,143 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const
using TIAcc = wrapper::traits::promote_t<T>;
using TAcc = wrapper::traits::promote_t<TIAcc>;
- Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
- const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
-
- if(_is_reshaped)
- {
- Window win_input(collapsed_window);
- win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator in(_input, win_input);
- Iterator out(_output, collapsed_window);
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
- {
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
- };
-
- const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+ Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
+ const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
-#endif /* __arm__ */
+ const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
- int i = 0;
- for(; i < _k; ++i)
- {
- const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
- // Convert 8bit to 16bit
- const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =
- {
- wrapper::vmovl(wrapper::vgetlow(b0_b8)),
- wrapper::vmovl(wrapper::vgethigh(b0_b8))
- };
-
- // Accumulate to U32
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
- }
+ Window win_out(collapsed_window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
- sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
- sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
- sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
- }
+ Window win_in(win_out);
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+ Iterator inb(_input, win_in);
+ Iterator out(_output, win_out);
- wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
- wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
- wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
- wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
- },
- in, out);
- }
- else // it is not reshaped
+ execute_window_loop(win_out, [&](const Coordinates & id)
{
- const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
-
- // The implementation computes 16 elements per iteration
- const int window_start_x = 16 * info.thread_id;
- const int window_step_x = 16 * info.num_threads;
- // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
- const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-
- Window win_out(collapsed_window);
- win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
-
- Window win_in(win_out);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator inb(_input, win_in);
- Iterator out(_output, win_out);
-
- execute_window_loop(win_out, [&](const Coordinates & id)
+ if(id.x() > width_matrix_b)
{
- if(id.x() > width_matrix_b)
- {
- return;
- }
+ return;
+ }
- // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
- {
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
- };
+ // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
+ {
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
+ };
- const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
#endif /* __arm__ */
- int i = 0;
- // This for loop performs 4 accumulations
- for(; i <= (_k - 4); i += 4)
- {
- const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
- const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
- const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
- const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for(; i <= (_k - 4); i += 4)
+ {
+ const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+ const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+ const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
#if __arm__
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */
- // Partial accumulation in 16bit
- typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
- {
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
- };
-
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
-
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
-
- matrix_b += 4 * in_b_stride;
- }
-
- // This for loop perfoms the leftover accumulations
- for(; i < _k; ++i)
+ // Partial accumulation in 16bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
{
- const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
-
- // Convert S8 to S16
- const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
- {
- wrapper::vmovl(wrapper::vgetlow(b0_b8)),
- wrapper::vmovl(wrapper::vgethigh(b0_b8))
- };
-
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
+ };
- matrix_b += in_b_stride;
- }
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
+
+ matrix_b += 4 * in_b_stride;
+ }
+
+ // This for loop perfoms the leftover accumulations
+ for(; i < _k; ++i)
+ {
+ const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
+ // Convert S8 to S16
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
{
- sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
- sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
- sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
- sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
- }
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+ wrapper::vmovl(wrapper::vgethigh(b0_b8))
+ };
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
- auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+ matrix_b += in_b_stride;
+ }
+ // Multiply by scalar if necessary
+ if(_mul_by_scalar)
+ {
+ sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+ sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+ sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+ sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+ }
+
+ auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+ if(id.x() + 16 < width_matrix_b)
+ {
wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
- },
- inb, out);
- }
+ }
+ else
+ {
+ auto left_over = width_matrix_b - id.x();
+ for(auto k = 0; k < 4 && left_over; ++k)
+ {
+ for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+ }
+ }
+ }
+ },
+ inb, out);
}
void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)