Diffstat (limited to 'src/cpu/kernels/activation')
-rw-r--r--  src/cpu/kernels/activation/generic/neon/fp16.cpp              43
-rw-r--r--  src/cpu/kernels/activation/generic/neon/fp32.cpp              39
-rw-r--r--  src/cpu/kernels/activation/generic/neon/impl.h               246
-rw-r--r--  src/cpu/kernels/activation/generic/neon/lut.cpp               57
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qasymm8.cpp          310
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp   274
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qsymm16.cpp          156
-rw-r--r--  src/cpu/kernels/activation/generic/sve/fp16.cpp              173
-rw-r--r--  src/cpu/kernels/activation/generic/sve/fp32.cpp              146
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/lut.cpp               57
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qasymm8.cpp          240
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp   274
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qsymm16.cpp          128
-rw-r--r--  src/cpu/kernels/activation/list.h                             54
14 files changed, 2197 insertions(+), 0 deletions(-)
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..ddc6dc24cd
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp16Params = {static_cast<float16_t>(1e-7), 8};
+} // namespace
+
+void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float16_t, Fp16Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..e558f8c73e
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4};
+} // namespace
+void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float, Fp32Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
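
A quick aside on the pattern shared by fp16.cpp and fp32.cpp: the Fp16Params/Fp32Params constants bind to the `const ActFpImplParams &P` template parameter declared in impl.h below. A minimal sketch of that pre-C++20 workaround, with illustrative names that are not from the library:

    struct Params
    {
        float delta;  // smallest value added to dodge NaN corner cases
        int   step_x; // elements handled per vector iteration
    };
    constexpr Params kF32{1e-24f, 4};

    // A float cannot be a non-type template parameter before C++20, but a
    // reference to a constexpr object can, so P.step_x stays a compile-time
    // constant inside the template.
    template <typename T, const Params &P>
    void kernel_sketch()
    {
        constexpr int step = P.step_x;
        static_assert(step > 0, "step is known at compile time");
    }

    // usage: kernel_sketch<float, kF32>();
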
diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h
new file mode 100644
index 0000000000..afeb6f7f3d
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+namespace arm_compute
+{
+namespace cpu
+{
+/** Constant parameters needed by the activation implementation.
+ * These parameters differ for each floating-point type.
+ *
+ * @note These are passed as a struct because C++ does not allow a float as a template parameter until C++20
+ **/
+struct ActFpImplParams
+{
+ float delta; /**< Minimum delta needed to avoid NaN on corner-cases of elementary functions */
+ int step_x; /**< Window step at the x dimension */
+};
+
+#ifndef __aarch64__
+inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
+{
+ auto int_in = vreinterpretq_u32_f32(in);
+ return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
+}
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
+{
+ auto int_in = vreinterpretq_u16_f16(in);
+ return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
+}
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#endif /* __aarch64__ */
+
+template <typename T, const ActFpImplParams &P>
+void fp_neon_activation_impl(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType =
+ typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ constexpr int window_step_x = P.step_x;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ // In case of non-aarch64, a small delta value is added to the input
+ // to prevent NAN values caused by zeros in inputs to SQRT.
+    // In case of aarch64, we call vsqrt directly, so we don't use delta.
+#ifndef __aarch64__
+ const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{});
+#else /* #ifndef __aarch64__ */
+ const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{});
+ const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{});
+#endif /* __aarch64__ */
+ const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+ const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
+ const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
+ const auto const_inv_6 = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{});
+ constexpr float soft_relu_thresh = 12.f;
+ const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{});
+ const auto va = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{});
+ const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
+ const auto a = static_cast<T>(act_info.a());
+ const auto b = static_cast<T>(act_info.b());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = wrapper::vabs(vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = wrapper::vmla(vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = wrapper::vmax(const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin,
+ wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
+ wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+#ifdef __aarch64__
+ tmp = wrapper::vsqrt(vin);
+#else /* __aarch64__ */
+ {
+ const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
+ tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
+ tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
+ }
+#endif /* __aarch64__ */
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = wrapper::vmul(vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_6,
+ wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(
+ const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
+ break;
+#ifdef __aarch64__
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
+ break;
+#endif /* __aarch64__ */
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = std::abs(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = a * in + b;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = std::max<T>(static_cast<T>(0), in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max<T>(b, in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = (in > 0) ? in : a * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = std::sqrt(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = in * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = a * std::tanh(b * in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = in / (static_cast<T>(1) + std::exp(-a * in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
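
The armv7 SQRT branch above leans on `mask_float_vector`: without a vector sqrt instruction the kernel computes 1/rsqrt, and the Newton-Raphson refinement behind `vinvsqrt` produces NaN for a zero input. A hedged scalar restatement of the trick (`approx_rsqrt` is an assumed stand-in, not a library call):

    float approx_rsqrt(float); // assumed hardware-style estimate + refinement

    float sqrt_via_rsqrt(float x)
    {
        const bool  is_zero = (x == 0.0f);
        const float delta   = 1e-24f;                       // Fp32Params.delta
        const float y       = x + (is_zero ? delta : 0.0f); // keep rsqrt's input non-zero
        const float r       = 1.0f / approx_rsqrt(y);       // sqrt(y) == 1 / rsqrt(y)
        return is_zero ? 0.0f : r;                          // mask zero lanes back to exact 0
    }
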
diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp
new file mode 100644
index 0000000000..ddd186f9cb
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/lut.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifdef __aarch64__
+void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    // LUT does not provide any performance benefit for ReLU as it's a single max() operation
+    ARM_COMPUTE_ERROR_ON(
+        (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) ||
+        act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU);
+ const auto window_end_x = window.x().end();
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
+}
+#endif // __aarch64__
+} // namespace cpu
+} // namespace arm_compute
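
Why a LUT wins here: a QASYMM8 input has only 256 possible values, so the whole dequantize -> activate -> requantize chain can be tabulated once and each element becomes a single byte lookup, which `lut_u8_neon` then vectorises. A hedged scalar sketch of building such a table for LOGISTIC; names are illustrative:

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstdint>

    std::array<uint8_t, 256> make_logistic_lut(float s_in, int z_in, float s_out, int z_out)
    {
        std::array<uint8_t, 256> lut{};
        for (int q = 0; q < 256; ++q)
        {
            const float x = s_in * (q - z_in);              // de-quantize
            const float y = 1.f / (1.f + std::exp(-x));     // activate
            const long  r = std::lround(y / s_out) + z_out; // re-quantize
            lut[q] = static_cast<uint8_t>(std::min(255L, std::max(0L, r)));
        }
        return lut;
    }
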
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..1451301ea2
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+ const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+ const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
+ const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
+ const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
+ const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
+ const auto vconst_1 = vdupq_n_f32(1.f);
+
+#ifndef __aarch64__
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+#else // #ifndef __aarch64__
+ const auto const_inv_2 = vdupq_n_f32(0.5f);
+ const auto const_inv_sqrt_2 = vdupq_n_f32(0.70710678118f);
+#endif // __aarch64__
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+
+#ifndef __aarch64__
+ const auto const_6_f32 = vdupq_n_f32(6.f);
+ const auto const_0_f32 = vdupq_n_f32(0.f);
+ const auto const_3_f32 = vdupq_n_f32(3.f);
+ const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+#endif // __aarch64__
+
+ // Initialise scale/offset for re-quantization
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ float32x4_t vs = vdupq_n_f32(s);
+ float32x4_t vo = vdupq_n_f32(o);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+
+ wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_u8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ const uint32x4x4_t pos_mask = {{
+ wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+ }};
+
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+ wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+ wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+ wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+ }};
+
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#else // #ifndef __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(vin_deq.val[0],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[0], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[1],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[1], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[2],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[2], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[3],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[3], const_inv_sqrt_2))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#endif // __aarch64__
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f = tmp_f * 0.5f * (1.0f + std::erf(tmp_f / 1.41421356237f));
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+#endif // __aarch64__
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
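
The `s`/`o` constants above fold dequantize-then-requantize into a single affine step. From real = s_in * (q - z_in) and q_out = real / s_out + z_out it follows that q_out = q * (s_in / s_out) + (z_out - z_in * s_in / s_out), i.e. s = s_in / s_out and o = -z_in * s + z_out, exactly the initialisation in the kernel. A hedged scalar equivalent of the leftover-element path:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t requantize(uint8_t q, float s_in, int z_in, float s_out, int z_out)
    {
        const float s = s_in / s_out;
        const float o = -z_in * s + z_out;
        const long  r = std::lround(q * s + o); // round to nearest, as the scalar tail above
        return static_cast<uint8_t>(std::min(255L, std::max(0L, r)));
    }
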
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..a2f588245a
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+ const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+ const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in);
+ const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in);
+ const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in);
+ const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0);
+#ifndef __aarch64__
+ const auto vconst_1 = vdupq_n_f32(1.f);
+ const auto vconst_0_f32 = vdupq_n_f32(0.f);
+#endif // __aarch64__
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+ const auto const_6_f32 = vdupq_n_f32(6.f);
+ const auto const_0_f32 = vdupq_n_f32(0.f);
+ const auto const_3_f32 = vdupq_n_f32(3.f);
+ const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+
+ // Initialise scale/offset for re-quantization
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ float32x4_t vs = vdupq_n_f32(s);
+ float32x4_t vo = vdupq_n_f32(o);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
+
+ wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_s8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+#ifdef __aarch64__
+ const uint32x4x4_t pos_mask = {{
+ wrapper::vcgtz(vin_deq.val[0]),
+ wrapper::vcgtz(vin_deq.val[1]),
+ wrapper::vcgtz(vin_deq.val[2]),
+ wrapper::vcgtz(vin_deq.val[3]),
+ }};
+#else // __aarch64__
+ const uint32x4x4_t pos_mask = {{
+ wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+ }};
+#endif // __aarch64__
+
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+ wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+ wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+ wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+ }};
+
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
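
Both q8 kernels process 16 bytes per iteration even though the arithmetic happens in fp32: `vdequantize` widens one 128-bit register into a float32x4x4_t, which is why every activation body above is written as four lanesets of four. A hedged sketch of that widening for the signed case (not the library's vdequantize):

    #include <arm_neon.h>

    float32x4x4_t dequantize16(int8x16_t q, float scale, int32_t offset)
    {
        const int16x8_t   lo   = vmovl_s8(vget_low_s8(q));  // widen bytes 0..7
        const int16x8_t   hi   = vmovl_s8(vget_high_s8(q)); // widen bytes 8..15
        const int32x4_t   voff = vdupq_n_s32(offset);
        const float32x4_t vs   = vdupq_n_f32(scale);
        float32x4x4_t out;
        out.val[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(lo)), voff)), vs);
        out.val[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(lo)), voff)), vs);
        out.val[2] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(hi)), voff)), vs);
        out.val[3] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(hi)), voff)), vs);
        return out; // real value = scale * (q - offset), four lanes at a time
    }
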
diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
new file mode 100644
index 0000000000..891646ea00
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ constexpr int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto vconst_1 = vdupq_n_f32(1.f);
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
+
+ wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
+ ARM_COMPUTE_UNUSED(tmp);
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
+ wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
+ qsymm16_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
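
QSYMM16 differs from the asymmetric kernels above in that the scheme has a scale but no offset, so zero is always exactly representable and (de)quantization is a pure multiply; that is why `vdequantize_int16`/`vquantize_int16` take only `qi_in.scale`/`qi_out.scale`. A hedged scalar sketch:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    float   dequantize_s16(int16_t q, float scale) { return q * scale; }
    int16_t quantize_s16(float x, float scale)
    {
        const long r = std::lround(x / scale);
        return static_cast<int16_t>(std::min(32767L, std::max(-32768L, r)));
    }
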
diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
new file mode 100644
index 0000000000..19d9126556
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/cpu/kernels/lut/list.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const auto const_1 = svdup_n_f16(1.f);
+ const auto const_0 = svdup_n_f16(0.f);
+ const auto const_6 = svdup_n_f16(6.f);
+ const auto const_3 = svdup_n_f16(3.f);
+ const auto const_inv_6 = svdup_n_f16(0.166666667f);
+
+ const auto va = svdup_n_f16(act_info.a());
+ const auto vb = svdup_n_f16(act_info.b());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ svfloat16_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_f16(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f16_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f16_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va),
+ svmax_f16_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin,
+ svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f16_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svmul_f16_z(
+ pg, const_inv_6,
+ svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svinv_f16_z(pg, svadd_f16_z(pg, const_1,
+ svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f16(pg, output_ptr + x, tmp);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
+}
+
+void sve_fp16_activation_lut(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::F16);
+ const auto window_start_x = window.x().start();
+ const auto window_end_x = window.x().end();
+ const auto size = window_end_x - window_start_x;
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+ lut_u16_sve(reinterpret_cast<const uint16_t *>(act_info.lut_fp16().data()), 1U /* num_strings (UNUSED) */,
+ size, input_ptr + window_start_x, output_ptr + window_start_x);
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
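
Unlike the fixed-width NEON kernels, the SVE loops above need no scalar tail: `svwhilelt` builds a predicate covering however many lanes remain, so the final partial vector goes through the same predicated loads and stores. A minimal self-contained sketch of the loop shape (fp32 for brevity):

    #include <arm_sve.h>

    void copy_f32(const float *in, float *out, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // lanes [x, n) active
        do
        {
            svst1_f32(pg, out + x, svld1_f32(pg, in + x)); // inactive lanes untouched
            x += svcntw();                                 // advance by the hardware vector length
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg)); // stop once no lane is active
    }
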
diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
new file mode 100644
index 0000000000..d1b075d52c
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEMath.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const auto const_1 = svdup_n_f32(1.f);
+ const auto const_0 = svdup_n_f32(0.f);
+ const auto const_6 = svdup_n_f32(6.f);
+ const auto const_3 = svdup_n_f32(3.f);
+ const auto const_inv_6 = svdup_n_f32(0.166666667f);
+ const auto soft_relu_thresh = svdup_n_f32(16.63553047f);
+
+ const auto va = svdup_n_f32(act_info.a());
+ const auto vb = svdup_n_f32(act_info.b());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ svfloat32_t tmp;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_f32(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f32_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f32_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va),
+ svmax_f32_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin,
+ svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin,
+ svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f32_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
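+                    // Hard swish: x * clamp(x + 3, 0, 6) / 6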
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svmul_f32_z(
+ pg, const_inv_6,
+ svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
+ break;
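+                    // Swish: x * sigmoid(a * x)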
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svinv_f32_z(pg, svadd_f32_z(pg, const_1,
+ svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f32(pg, output_ptr + x, tmp);
+
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp
new file mode 100644
index 0000000000..5db8595a75
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifdef __aarch64__
+void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    // LUT does not provide any performance benefit for ReLU as it's a single max() operation
+    ARM_COMPUTE_ERROR_ON(
+        (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) ||
+        act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU);
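+    // For every other activation, the 256-entry table supplied by act_info.lut()
+    // folds the activation and the output requantization into a single byte lookup.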
+ const auto window_end_x = window.x().end();
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = input.ptr();
+ auto output_ptr = output.ptr();
+ lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
+}
+#endif // __aarch64__
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
new file mode 100644
index 0000000000..7efa9e4b72
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+ const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+ const auto const_0 = quantize_qasymm8(0.f, qi_in);
+ const auto vconst_0 = svdup_n_u8(const_0);
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+
+ // Initialise scale/offset for re-quantization
+ bool requant = true;
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ {
+ requant = false;
+ }
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ auto vs = svdup_n_f32(s);
+ auto vo = svdup_n_f32(o);
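+    // out_q = in_q * s + o implements dequantize-then-requantize:
+    // ((in_q - in_off) * in_scale) / out_scale + out_off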
+
+ // Initialise scale/offset for re-quantization with int32_t
+ const auto voffset_in = svdup_n_s32(qi_in.offset);
+ int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_s32 = svdup_n_s32(s_s32);
+ const auto vo_s32 = svdup_n_s32(o_s32);
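+    // Scale/offset are held in Q24.8 fixed point (value * 2^8, rounded to nearest even),
+    // so leaky ReLU can requantize entirely in int32: out = (in * s_s32 + o_s32) >> 8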
+
+ // Initialise scale/offset for re-quantization for leaky relu
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+ const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ svuint8_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_u8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = svmax_u8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+ // Re-quantize to new output space
+                    tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
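+                    // One u8 vector dequantizes into four f32 vectors (an x4 tuple),
+                    // processed quarter by quarter below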
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
+
+ // Expand to int32
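+                    // svmovlb/svmovlt widen the even/odd lanes respectively; the matching
+                    // bottom/top narrowing pair at the end restores the original lane order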
+ const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
+
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+                    // Convert int32 vectors to uint16 vectors (with saturation)
+ const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                    // Convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_u8(pg, output_ptr + x, tmp);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
new file mode 100644
index 0000000000..e4667522dd
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+ const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+ const auto const_0 = quantize_qasymm8_signed(0.f, qi_in);
+ const auto vconst_0 = svdup_n_s8(const_0);
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+ const auto const_6_f32 = svdup_n_f32(6.f);
+ const auto const_0_f32 = svdup_n_f32(0.f);
+ const auto const_3_f32 = svdup_n_f32(3.f);
+ const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
+
+ // Initialise scale/offset for re-quantization
+ bool requant = true;
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ {
+ requant = false;
+ }
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ auto vs = svdup_n_f32(s);
+ auto vo = svdup_n_f32(o);
+
+ // Initialise scale/offset for re-quantization with int32_t
+ const auto voffset_in = svdup_n_s32(qi_in.offset);
+ int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_s32 = svdup_n_s32(s_s32);
+ const auto vo_s32 = svdup_n_s32(o_s32);
+
+ // Initialise scale/offset for re-quantization for leaky relu
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+ const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ svint8_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_s8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = svmax_s8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, svget4_f32(vin_deq, 0),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 0),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 1),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 1),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 2),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 2),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 3),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 3),
+ const_3_f32))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
+
+ // Expand to int32
+ const svint32x4_t vin_s32 =
+ svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)),
+ svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin)));
+
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+                    // Convert int32 vectors to int16 vectors (with saturation)
+ const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                    // Convert int16 vectors to int8 vectors (with saturation)
+ tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_s8(pg, output_ptr + x, tmp);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
new file mode 100644
index 0000000000..f955893307
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/SVESymm.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ svint16_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_s16(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
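+                    // QSYMM16 dequantizes into two f32 vectors (an x2 tuple); symmetric
+                    // quantization has no offset, so only the scale is applied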
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep =
+ svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
+ svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_s16(pg, output_ptr + x, tmp);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h
new file mode 100644
index 0000000000..8c24adc3fe
--- /dev/null
+++ b/src/cpu/kernels/activation/list.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
+#define ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ACTIVATION_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
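+
+// For example, DECLARE_ACTIVATION_KERNEL(neon_fp32_activation) declares:
+//   void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window);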
+
+#ifdef __aarch64__
+DECLARE_ACTIVATION_KERNEL(neon_q8_activation_lut);
+#endif // __aarch64__
+DECLARE_ACTIVATION_KERNEL(sve2_q8_activation_lut);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation_lut);
+DECLARE_ACTIVATION_KERNEL(sve_fp32_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp32_activation);
+
+#undef DECLARE_ACTIVATION_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H