From 41eb2d92c89274200d59ff97653e2bd66819b310 Mon Sep 17 00:00:00 2001
From: Pablo Marquez Tello
Date: Thu, 23 Jun 2022 16:02:05 +0100
Subject: Improve LUT Neon Hard-Swish

* Changed window_step from 16 to tensor_shape().x() when calling into the assembly byte substitution code.
* Resolve COMPMID-5211

Change-Id: I5c1f5273455999bb35f94c76a8afb4290e728858
Signed-off-by: Pablo Marquez Tello
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7843
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Gian Marco Iodice
---
 .../kernels/activation/generic/neon/qasymm8.cpp | 25 ++++------------------
 1 file changed, 4 insertions(+), 21 deletions(-)

diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index f35d0d298f..29f5e6b376 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -421,33 +421,16 @@ void neon_qasymm8_hardswish_lut(const ITensor *src, ITensor *dst, const Activati
 {
     ARM_COMPUTE_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::HARD_SWISH);
 #ifdef __aarch64__
-    constexpr int window_step_x = 16;
-    const auto window_start_x = static_cast(window.x().start());
-    const auto window_end_x = static_cast(window.x().end());
-
+    const int window_step_x = src->info()->tensor_shape().x();
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
-
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
-        // Compute S elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto input_ptr = reinterpret_cast(input.ptr() + x);
-            auto output_ptr = reinterpret_cast(output.ptr() + x);
-            substitute_bytes_neon(act_info.lut().data(), 1u, window_step_x, &input_ptr, &output_ptr);
-        }
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const auto input_ptr = reinterpret_cast(input.ptr() + x);
-            auto output_ptr = reinterpret_cast(output.ptr() + x);
-            substitute_bytes_neon(act_info.lut().data(), 1u, 1u, &input_ptr, &output_ptr);
-        }
+        const auto input_ptr = reinterpret_cast(input.ptr());
+        auto output_ptr = reinterpret_cast(output.ptr());
+        substitute_bytes_neon(act_info.lut().data(), 1u, window_step_x, &input_ptr, &output_ptr);
     },
     input, output);
 #else // #ifdef __aarch64__
--
cgit v1.2.1