aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Pablo Marquez Tello <pablo.tello@arm.com> 2022-06-23 16:02:05 +0100
committer Pablo Marquez Tello <pablo.tello@arm.com> 2022-06-24 09:17:19 +0000
commit 41eb2d92c89274200d59ff97653e2bd66819b310 (patch)
tree 2cb2d9022e16ac8780bba3ab308f73aa15ce5811
parent 700b913ed9257c44147372378bc8a0dadcfd2ac2 (diff)
download ComputeLibrary-41eb2d92c89274200d59ff97653e2bd66819b310.tar.gz
Improve LUT Neon Hard-Swish
* Changed window_step from 16 to tensor_shape().x() when calling into the assembly byte substitution code.
* Resolves COMPMID-5211

Change-Id: I5c1f5273455999bb35f94c76a8afb4290e728858
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7843
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r-- src/cpu/kernels/activation/generic/neon/qasymm8.cpp 25
1 file changed, 4 insertions, 21 deletions
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index f35d0d298f..29f5e6b376 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -421,33 +421,16 @@ void neon_qasymm8_hardswish_lut(const ITensor *src, ITensor *dst, const Activati
{
ARM_COMPUTE_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::HARD_SWISH);
#ifdef __aarch64__
- constexpr int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
+ const int window_step_x = src->info()->tensor_shape().x();
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
-
execute_window_loop(win_collapsed, [&](const Coordinates &)
{
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr() + x);
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr() + x);
- substitute_bytes_neon(act_info.lut().data(), 1u, window_step_x, &input_ptr, &output_ptr);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr() + x);
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr() + x);
- substitute_bytes_neon(act_info.lut().data(), 1u, 1u, &input_ptr, &output_ptr);
- }
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ substitute_bytes_neon(act_info.lut().data(), 1u, window_step_x, &input_ptr, &output_ptr);
},
input, output);
#else // #ifdef __aarch64__