Make CpuActivation stateless

- Rename NEActivationLayer to CpuActivation - Add member function to generate execution window Partially Resolves: COMPMID-3992 Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Change-Id: I4e1ae15cf456b860d3080b2fedc4dbcce7d1bb79 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4791 Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2021-01-08 17:25:55 +0000
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2021-01-11 16:48:31 +0000
commit: f8f0442e9a6105be0e32f4defec5fbc10248ea6e (patch)
tree: d4e77c82f57df175dcec6c46ed2f74f4a8b72d7a /src/core/cpu
parent: 4f77ba9f2dccbae1b46b2d4e17d862560f858050 (diff)
download: ComputeLibrary-f8f0442e9a6105be0e32f4defec5fbc10248ea6e.tar.gz
18 files changed, 2369 insertions, 3 deletions
diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp
new file mode 100644
index 0000000000..abdba3ae53
--- /dev/null
+++ b/src/core/cpu/kernels/CpuActivationKernel.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/CpuActivationKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/cpu/kernels/activation/list.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+/** Attributes a micro-kernel selection predicate inspects when deciding whether its kernel applies. */
+struct ActivationSelectorData
+{
+    DataType dt; /**< Data type to match; the callers in this file pass the source tensor's data type */
+};
+
+/** Predicate telling whether a micro-kernel can handle the given selector data */
+using ActivationSelectorPtr = bool (*)(const ActivationSelectorData &);
+/** Micro-kernel entry point, called as (source tensor, destination tensor, activation info, execution window) */
+using ActivationKernelPtr   = void (*)(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &);
+
+/** Entry of the micro-kernel table: a selection predicate paired with the function it selects. */
+struct ActivationKernel
+{
+    const char                 *name;        /**< Name string of the micro-kernel */
+    const ActivationSelectorPtr is_selected; /**< Returns true when this entry matches the selector data */
+    ActivationKernelPtr         ukernel;     /**< Function to run; validate_arguments() rejects entries where this is nullptr */
+};
+
+/** Table of candidate micro-kernels, scanned in declaration order by get_implementation().
+ *
+ * Floating-point entries are the SVE variants when __ARM_FEATURE_SVE is defined and the NEON variants otherwise;
+ * quantized entries are the SVE variants when __ARM_FEATURE_SVE2 is defined and the NEON variants otherwise.
+ * Each predicate matches exactly one data type.
+ * NOTE(review): the REGISTER_* macros presumably expand to nullptr when the corresponding kernel is compiled out,
+ * which would explain the ukernel nullptr check in validate_arguments() - confirm in Registrars.h.
+ */
+static const ActivationKernel available_kernels[] =
+{
+#if defined(__ARM_FEATURE_SVE)
+    {
+        "fp16_sve_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
+    },
+    {
+        "fp32_sve_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
+    },
+#else  /* !defined(__ARM_FEATURE_SVE) */
+    {
+        "fp16_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
+    },
+    {
+        "fp32_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
+    },
+#endif /* defined(__ARM_FEATURE_SVE)  */
+
+#if defined(__ARM_FEATURE_SVE2) /* defined(__ARM_FEATURE_SVE2) */
+    {
+        "qasymm8_sve_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
+        REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
+    },
+    {
+        "qasymm8_signed_sve_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
+    },
+    {
+        "qsymm16_sve_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
+        REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
+    },
+#else  /* !defined(__ARM_FEATURE_SVE2) */
+    {
+        "qasymm8_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
+    },
+    {
+        "qasymm8_signed_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
+    },
+    {
+        "qsymm16_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
+        REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
+    },
+#endif /* defined(__ARM_FEATURE_SVE2) */
+};
+
+/** Look up the micro-kernel matching the given selector data.
+ *
+ * @param[in] data Selector attributes handed to each table entry's predicate.
+ *
+ * @return Pointer to the first entry of available_kernels whose predicate accepts @p data,
+ *         or nullptr when no entry does.
+ */
+const ActivationKernel *get_implementation(const ActivationSelectorData &data)
+{
+    const ActivationKernel *match = nullptr;
+    // Walk the table in declaration order and stop at the first accepting predicate
+    for(const ActivationKernel *entry = std::begin(available_kernels); (match == nullptr) && (entry != std::end(available_kernels)); ++entry)
+    {
+        if(entry->is_selected(data))
+        {
+            match = entry;
+        }
+    }
+    return match;
+}
+
+/* Activation functions that validate_arguments() accepts for asymmetric quantized data types;
+ * any other function is rejected with an error status for those types. */
+static const std::array<ActivationLayerInfo::ActivationFunction, 7> qasymm8_activations =
+{
+    ActivationLayerInfo::ActivationFunction::RELU,
+    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+    ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
+};
+/* Activation functions that validate_arguments() accepts for symmetric quantized data types;
+ * any other function is rejected with an error status for those types. */
+static const std::array<ActivationLayerInfo::ActivationFunction, 3> qsymm16_activations =
+{
+    ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::HARD_SWISH
+};
+
+/** Check that an activation can be run on the given tensor infos.
+ *
+ * @param[in] input           Source tensor info. It is dereferenced unconditionally, so it must not be nullptr.
+ * @param[in] output          Destination tensor info. May be nullptr; the quantization checks then use the
+ *                            quantization info of @p input and the shape/data type comparison is skipped.
+ * @param[in] activation_info Activation layer information; only the activation function is inspected here.
+ *
+ * @return an empty Status when every check passes, an error status otherwise
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
+
+    // A table entry must exist for the data type and its function pointer must be populated
+    const auto *uk = get_implementation(ActivationSelectorData{ input->data_type() });
+    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    const DataType                                data_type = input->data_type();
+    const QuantizationInfo                       &oq_info   = (output != nullptr) ? output->quantization_info() : input->quantization_info();
+    const ActivationLayerInfo::ActivationFunction f_act     = activation_info.activation();
+
+    // Quantized data types only accept the activation functions listed in the tables above
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)),
+                                    "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
+
+    // The message names every entry of qsymm16_activations, including hard swish
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)),
+                                    "For QSYMM16 only tanh, logistic and hard swish are supported");
+
+    // TANH and LOGISTIC require one specific output quantization per data type.
+    // NOTE(review): QASYMM16 is not among the data types accepted at the top of this function, so that half of the
+    // two conditions below looks unreachable - confirm before relying on it.
+    ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH)
+                                && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+    ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                                && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+/** Compute the execution window and, when an output info is supplied, initialise it from the input.
+ *
+ * @param[in]      input  Source tensor info; must not be nullptr.
+ * @param[in, out] output Destination tensor info. When non-null it is auto-initialised from a clone of @p input
+ *                        if still empty, and its valid region is set to span its tensor shape.
+ *
+ * @return a pair holding an empty Status and the window computed from @p input
+ */
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output)
+{
+    // The window is derived from the input alone, using default-constructed steps
+    Window win = calculate_max_window(*input, Steps());
+
+    // Nothing else to set up when no output info is given
+    if(output == nullptr)
+    {
+        return std::make_pair(Status{}, win);
+    }
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input->clone());
+
+    // Valid region: default-constructed coordinates sized to the output's dimensionality, plus the output's full shape
+    Coordinates anchor;
+    anchor.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(anchor, output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+/** Record the activation info, validate the arguments and set up the execution window.
+ *
+ * @param[in]  input           Source tensor info; must not be nullptr.
+ * @param[out] output          Destination tensor info; must not be nullptr.
+ * @param[in]  activation_info Activation layer information, copied into the kernel.
+ */
+void CpuActivationKernel::configure(const ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo activation_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // The activation info is stored before the arguments are validated
+    _act_info = activation_info;
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, activation_info));
+
+    // Derive the execution window (this also initialises the output info) and hand it to the base kernel
+    std::pair<Status, Window> window_setup = validate_and_configure_window(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(window_setup.first);
+    ICPPKernel::configure(window_setup.second);
+}
+
+/** Check whether the given tensor infos and activation info form a valid configuration.
+ *
+ * The window step runs on clones, so neither @p input nor @p output is modified.
+ *
+ * @param[in] input    Source tensor info; must not be nullptr.
+ * @param[in] output   Destination tensor info; may be nullptr.
+ * @param[in] act_info Activation layer information.
+ *
+ * @return an empty Status on success, the first error status encountered otherwise
+ */
+Status CpuActivationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
+    // The cloned infos are temporaries that live until the end of this statement, i.e. for the whole call
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
+
+    return Status{};
+}
+
+/** Run the activation micro-kernel selected for the source tensor's data type.
+ *
+ * Returns immediately, without touching the tensors, when the stored activation info is not enabled.
+ *
+ * @param[in] tensors Tensor pack; the source is read from ACL_SRC and the destination from ACL_DST.
+ * @param[in] window  Region to execute; checked against the kernel's configured window.
+ * @param[in] info    Thread information (unused).
+ */
+void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    // Early exit on disabled activation
+    if(!_act_info.enabled())
+    {
+        return;
+    }
+
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+    const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST);
+    // src is dereferenced just below to pick the micro-kernel
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    // Same condition validate_arguments() rejects: no table entry, or an entry without a function
+    const auto *uk = get_implementation(ActivationSelectorData{ src->info()->data_type() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    uk->ukernel(src, dst, _act_info, window);
+}
+
+/** Name of this kernel.
+ *
+ * @return the string literal "CpuActivationKernel"
+ */
+const char *CpuActivationKernel::name() const
+{
+    return "CpuActivationKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuActivationKernel.h b/src/core/cpu/kernels/CpuActivationKernel.h
new file mode 100644
index 0000000000..083915ba9f
--- /dev/null
+++ b/src/core/cpu/kernels/CpuActivationKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
+#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the activation kernel */
+class CpuActivationKernel : public ICpuKernel
+{
+public:
+    CpuActivationKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel);
+    /** Set the input and output tensor.
+     *
+     * @note If the output tensor is a nullptr, the activation function will be performed in-place
+     *
+     * @param[in, out] src             Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result
+     *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out]     dst             Destination tensor info. Data type supported: same as @p src
+     * @param[in]      activation_info Activation layer information.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
+     *
+     * @param[in] src      Source tensor info. In case of @p dst tensor info = nullptr, this tensor will store the result
+     *                     of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[in] dst      Destination tensor info. Data type supported: same as @p src
+     * @param[in] act_info Activation layer information.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+private:
+    ActivationLayerInfo _act_info{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */
diff --git a/src/core/cpu/kernels/floor/CpuFloorKernel.cpp b/src/core/cpu/kernels/CpuFloorKernel.cpp
index 738f04d14a..6115b69907 100644
--- a/src/core/cpu/kernels/floor/CpuFloorKernel.cpp
+++ b/src/core/cpu/kernels/CpuFloorKernel.cpp
@@ -32,7 +32,7 @@
 #include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/floor/impl/list.h"
+#include "src/core/cpu/kernels/floor/list.h"
 
 namespace arm_compute
 {
diff --git a/src/core/cpu/kernels/CpuFloorKernel.h b/src/core/cpu/kernels/CpuFloorKernel.h
index dc3a9d5ff1..25d78c7870 100644
--- a/src/core/cpu/kernels/CpuFloorKernel.h
+++ b/src/core/cpu/kernels/CpuFloorKernel.h
@@ -63,8 +63,8 @@ public:
     Window infer_window(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) final;
-    const char *name() const final;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/core/cpu/kernels/activation/NEON/fp16.cpp b/src/core/cpu/kernels/activation/NEON/fp16.cpp
new file mode 100644
index 0000000000..27ae2830cc
--- /dev/null
+++ b/src/core/cpu/kernels/activation/NEON/fp16.cpp
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/NEON/NEMath.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+#ifndef __aarch64__
+/** Bitwise-AND the raw bits of a half-precision vector with a mask.
+ *
+ * @param[in] in   Vector of eight float16 values.
+ * @param[in] mask Vector of eight 16-bit masks, applied to the bit pattern of @p in.
+ *
+ * @return the masked bits, reinterpreted as float16 values
+ */
+inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
+{
+    return vreinterpretq_f16_u16(wrapper::vand(vreinterpretq_u16_f16(in), mask));
+}
+#endif /* __aarch64__ */
+} // namespace
+
+/** Apply an activation function element-wise to a half-precision tensor using NEON.
+ *
+ * Each row along X is handled in two phases: a vector loop processing 8 elements per step, followed by a
+ * scalar loop for the elements that do not fill a whole vector.
+ *
+ * @param[in]  src      Source tensor; its elements are read as float16_t.
+ * @param[out] dst      Destination tensor; its elements are written as float16_t.
+ * @param[in]  act_info Activation layer information supplying the function and its a/b parameters.
+ * @param[in]  window   Execution window. Its X range bounds the inner loops; the other dimensions drive the outer iteration.
+ */
+void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    /** NEON vector tag type. */
+    using ExactTagType                                = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
+    const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+    constexpr int window_step_x  = 8;
+    const auto    window_start_x = static_cast<int>(window.x().start());
+    const auto    window_end_x   = static_cast<int>(window.x().end());
+
+    // X is iterated manually inside the lambda, so the outer window visits each row exactly once
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    // In case of non-aarch64, a small delta value is added to the input
+    // to prevent NAN values caused by zeros in inputs to SQRT.
+    // In case of aarch64, we call vsqrt directly, so we don't use delta.
+#ifndef __aarch64__
+    const auto delta = wrapper::vdup_n(static_cast<float16_t>(1e-7), ExactTagType{});
+#endif /* __aarch64__ */
+
+    const auto const_1     = wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
+    const auto const_0     = wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
+    const auto const_6     = wrapper::vdup_n(static_cast<float16_t>(6.f), ExactTagType{});
+    const auto const_3     = wrapper::vdup_n(static_cast<float16_t>(3.f), ExactTagType{});
+    const auto const_inv_6 = wrapper::vdup_n(static_cast<float16_t>(0.166666667f), ExactTagType{});
+
+    // Activation parameters, both broadcast into vectors and as scalars for the left-over loop
+    const auto va = wrapper::vdup_n(static_cast<float16_t>(act_info.a()), ExactTagType{});
+    const auto vb = wrapper::vdup_n(static_cast<float16_t>(act_info.b()), ExactTagType{});
+    const auto a  = static_cast<float16_t>(act_info.a());
+    const auto b  = static_cast<float16_t>(act_info.b());
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<float16_t, wrapper::traits::BitWidth::W128> tmp;
+
+        // Compute window_step_x elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            switch(act)
+            {
+                case ActivationLayerInfo::ActivationFunction::ABS:
+                    tmp = wrapper::vabs(vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LINEAR:
+                    tmp = wrapper::vmla(vb, va, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                    tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::RELU:
+                    tmp = wrapper::vmax(const_0, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                    tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                    tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::ELU:
+                    tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQRT:
+#ifdef __aarch64__
+                    tmp = wrapper::vsqrt(vin);
+#else  /* aarch64 */
+                    {
+                        // Lanes equal to zero get delta added before the reciprocal square root,
+                        // and are forced back to zero afterwards.
+                        // const_0 is reused so the zero vector has the same element type as vin.
+                        const auto bitmask = wrapper::vceq(vin, const_0);
+                        tmp                 = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
+                        tmp                 = mask_float_vector(tmp, wrapper::vnot(bitmask));
+                    }
+#endif /* aarch64 */
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQUARE:
+                    tmp = wrapper::vmul(vin, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::TANH:
+                    tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                    tmp = vin;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                    tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            const float16_t in = input_ptr[x];
+            // Named differently from the vector accumulator above to avoid shadowing it
+            float16_t res;
+            switch(act)
+            {
+                case ActivationLayerInfo::ActivationFunction::ABS:
+                    res = std::abs(in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LINEAR:
+                    res = a * in + b;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                    res = static_cast<float16_t>(1) / (static_cast<float16_t>(1) + std::exp(-in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::RELU:
+                    res = std::max<float16_t>(static_cast<float16_t>(0), in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                    res = std::min<float16_t>(a, std::max(static_cast<float16_t>(0), in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                    res = std::min<float16_t>(a, std::max<float16_t>(b, in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                    res = (in > 0) ? in : a * in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                    res = std::log(static_cast<float16_t>(1) + std::exp(in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::ELU:
+                    res = (in >= 0) ? in : a * (std::exp(in) - 1);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQRT:
+                    res = std::sqrt(in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQUARE:
+                    res = in * in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::TANH:
+                    res = a * std::tanh(b * in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                    res = in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                    res = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = res;
+        }
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+\ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/NEON/fp32.cpp b/src/core/cpu/kernels/activation/NEON/fp32.cpp
new file mode 100644
index 0000000000..0687646be7
--- /dev/null
+++ b/src/core/cpu/kernels/activation/NEON/fp32.cpp
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+#ifndef __aarch64__ // Helper used only by the non-aarch64 SQRT path below
+inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
+{
+    // View the float lanes as raw 32-bit words, AND them with the mask, then view the result as floats again.
+    return vreinterpretq_f32_u32(wrapper::vand(vreinterpretq_u32_f32(in), mask));
+}
+#endif /* __aarch64__ */
+} // namespace
+
+void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{ // Applies the activation selected by act_info to the float data of src and writes the result to dst, over window.
+    /** NEON vector tag type. */
+    using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
+
+    constexpr int                                 window_step_x  = 4;
+    const auto                                    window_start_x = static_cast<int>(window.x().start());
+    const auto                                    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act            = act_info.activation();
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); // single step in X: the X range is walked manually inside the lambda
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    // On non-aarch64 targets, a small delta value is added to zero-valued input lanes
+    // to prevent NaN values caused by zeros in inputs to SQRT.
+    // On aarch64, vsqrt is called directly, so delta is not used.
+#ifndef __aarch64__
+    const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
+#endif /* __aarch64__ */
+    const auto const_1     = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
+    const auto const_0     = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+    const auto const_6     = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
+    const auto const_3     = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
+    const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
+
+    const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{}); // vector copies of a/b for the vectorised loop
+    const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
+    const auto a  = static_cast<float>(act_info.a()); // scalar copies of a/b for the left-over loop
+    const auto b  = static_cast<float>(act_info.b());
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
+
+        // Vectorised main loop: process window_step_x (4) floats per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            switch(act)
+            {
+                case ActivationLayerInfo::ActivationFunction::ABS:
+                    tmp = wrapper::vabs(vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LINEAR:
+                    tmp = wrapper::vmla(vb, va, vin); // vector counterpart of the scalar a * in + b below
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                    tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::RELU:
+                    tmp = wrapper::vmax(const_0, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                    tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                    tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::ELU:
+                    tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQRT:
+#ifdef __aarch64__
+                    tmp = wrapper::vsqrt(vin);
+#else  /* aarch64 */
+                    {
+                        const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); // lanes whose input equals zero
+                        tmp                 = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); // delta is added to the zero lanes only
+                        tmp                 = mask_float_vector(tmp, wrapper::vnot(bitmask)); // clear the lanes whose input was zero
+                    }
+#endif /* aarch64 */
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQUARE:
+                    tmp = wrapper::vmul(vin, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::TANH:
+                    tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                    tmp = vin;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                    tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements one at a time with the scalar form of the same activations
+        for(; x < window_end_x; ++x)
+        {
+            const float in = *(reinterpret_cast<const float *>(input_ptr + x));
+            float       tmp;
+            switch(act)
+            {
+                case ActivationLayerInfo::ActivationFunction::ABS:
+                    tmp = std::abs(in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LINEAR:
+                    tmp = a * in + b;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                    tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::RELU:
+                    tmp = std::max<float>(static_cast<float>(0), in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                    tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                    tmp = std::min<float>(a, std::max<float>(b, in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                    tmp = (in > 0) ? in : a * in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                    tmp = std::log(static_cast<float>(1) + std::exp(in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::ELU:
+                    tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQRT:
+                    tmp = std::sqrt(in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQUARE:
+                    tmp = in * in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::TANH:
+                    tmp = a * std::tanh(b * in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                    tmp = in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                    tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/NEON/qasymm8.cpp b/src/core/cpu/kernels/activation/NEON/qasymm8.cpp
new file mode 100644
index 0000000000..7506a8294f
--- /dev/null
+++ b/src/core/cpu/kernels/activation/NEON/qasymm8.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{ // Applies the activation selected by act_info to the qasymm8_t data of src and writes the result, in dst's quantization, to dst.
+    constexpr int                                 window_step_x  = 16;
+    const auto                                    window_start_x = static_cast<int>(window.x().start());
+    const auto                                    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act            = act_info.activation();
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); // single step in X: the X range is walked manually inside the lambda
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    const UniformQuantizationInfo qi_in    = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out   = dst->info()->quantization_info().uniform();
+    const qasymm8x16_t            va       = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); // a/b quantized with the input quantization info
+    const qasymm8x16_t            vb       = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+    const qasymm8_t               a        = quantize_qasymm8(act_info.a(), qi_in);
+    const qasymm8_t               b        = quantize_qasymm8(act_info.b(), qi_in);
+    const qasymm8_t               const_0  = quantize_qasymm8(0.f, qi_in); // 0.f quantized with the input quantization info
+    const qasymm8x16_t            vconst_0 = vdupq_n_u8(const_0);
+    const auto                    vconst_1 = vdupq_n_f32(1.f);
+#ifndef __aarch64__
+    const auto vconst_0_f32 = vdupq_n_f32(0); // zero vector for the vcgt-based LEAKY_RELU mask on non-aarch64
+#endif // __aarch64__
+    const float32x4_t va_f32          = vdupq_n_f32(act_info.a()); // unquantized a/b for the paths that compute in float
+    const float32x4_t vb_f32          = vdupq_n_f32(act_info.b());
+    const float       a_f32           = act_info.a();
+    const float       b_f32           = act_info.b();
+    const auto        const_6_f32     = vdupq_n_f32(6.f);
+    const auto        const_0_f32     = vdupq_n_f32(0.f);
+    const auto        const_3_f32     = vdupq_n_f32(3.f);
+    const auto        const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+
+    // Initialise scale/offset for re-quantization: a value q in the input space maps to q * s + o in the output space
+    float       s  = qi_in.scale / qi_out.scale;
+    float       o  = -qi_in.offset * s + qi_out.offset;
+    float32x4_t vs = vdupq_n_f32(s);
+    float32x4_t vo = vdupq_n_f32(o);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
+
+        // Vectorised main loop: process window_step_x (16) elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                // Perform activation
+                tmp = vmaxq_u8(vconst_0, vin);
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                const auto vin_deq = vdequantize(vin, qi_in); // De-quantize
+
+#ifdef __aarch64__
+                const uint32x4x4_t pos_mask =
+                {
+                    {
+                        wrapper::vcgtz(vin_deq.val[0]),
+                        wrapper::vcgtz(vin_deq.val[1]),
+                        wrapper::vcgtz(vin_deq.val[2]),
+                        wrapper::vcgtz(vin_deq.val[3]),
+                    }
+                };
+#else  // __aarch64__
+                const uint32x4x4_t pos_mask =
+                {
+                    {
+                        wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+                        wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+                        wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+                        wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+                    }
+                };
+#endif // __aarch64__
+
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+                        wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+                        wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+                        wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+                    }
+                };
+
+                tmp = vquantize(tmp_dep, qi_out); // Re-quantize to new output space
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements one at a time with the scalar form of the same activations
+        for(; x < window_end_x; ++x)
+        {
+            qasymm8_t in  = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+            qasymm8_t tmp = 0;
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                tmp = std::max(const_0, in);
+                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o); // Re-quantize to new output space
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(const_0, in));
+                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o); // Re-quantize to new output space
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(b, in));
+                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o); // Re-quantize to new output space
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp
new file mode 100644
index 0000000000..8f75abea8e
--- /dev/null
+++ b/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    constexpr int                                 window_step_x  = 16;
+    const auto                                    window_start_x = static_cast<int>(window.x().start());
+    const auto                                    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act            = act_info.activation();
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    const UniformQuantizationInfo qi_in    = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out   = dst->info()->quantization_info().uniform();
+    const qasymm8x16_signed_t     va       = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+    const qasymm8x16_signed_t     vb       = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+    const qasymm8_signed_t        a        = quantize_qasymm8_signed(act_info.a(), qi_in);
+    const qasymm8_signed_t        b        = quantize_qasymm8_signed(act_info.b(), qi_in);
+    const qasymm8_signed_t        const_0  = quantize_qasymm8_signed(0.f, qi_in);
+    const qasymm8x16_signed_t     vconst_0 = vdupq_n_s8(const_0);
+    const auto                    vconst_1 = vdupq_n_f32(1.f);
+#ifndef __aarch64__
+    const auto vconst_0_f32 = vdupq_n_f32(1.f);
+#endif // __aarch64__
+    const float32x4_t va_f32          = vdupq_n_f32(act_info.a());
+    const float32x4_t vb_f32          = vdupq_n_f32(act_info.b());
+    const float       a_f32           = act_info.a();
+    const float       b_f32           = act_info.b();
+    const auto        const_6_f32     = vdupq_n_f32(6.f);
+    const auto        const_0_f32     = vdupq_n_f32(0.f);
+    const auto        const_3_f32     = vdupq_n_f32(3.f);
+    const auto        const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+
+    // Initialise scale/offset for re-quantization
+    float       s  = qi_in.scale / qi_out.scale;
+    float       o  = -qi_in.offset * s + qi_out.offset;
+    float32x4_t vs = vdupq_n_f32(s);
+    float32x4_t vo = vdupq_n_f32(o);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
+
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                // Perform activation
+                tmp = vmaxq_s8(vconst_0, vin);
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_s8(va, vmaxq_s8(vb, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize_signed(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize_signed(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize_signed(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                const auto vin_deq = vdequantize(vin, qi_in);
+
+#ifdef __aarch64__
+                const uint32x4x4_t pos_mask =
+                {
+                    {
+                        wrapper::vcgtz(vin_deq.val[0]),
+                        wrapper::vcgtz(vin_deq.val[1]),
+                        wrapper::vcgtz(vin_deq.val[2]),
+                        wrapper::vcgtz(vin_deq.val[3]),
+                    }
+                };
+#else  // __aarch64__
+                const uint32x4x4_t pos_mask =
+                {
+                    {
+                        wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+                        wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+                        wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+                        wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+                    }
+                };
+#endif // __aarch64__
+
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+                        wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+                        wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+                        wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+                    }
+                };
+
+                tmp = vquantize_signed(tmp_dep, qi_out);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            qasymm8_signed_t in  = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+            qasymm8_signed_t tmp = 0;
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                tmp = std::max(const_0, in);
+                tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(const_0, in));
+                tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(b, in));
+                tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
+                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/NEON/qsymm16.cpp b/src/core/cpu/kernels/activation/NEON/qsymm16.cpp
new file mode 100644
index 0000000000..9eee360427
--- /dev/null
+++ b/src/core/cpu/kernels/activation/NEON/qsymm16.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+// QSYMM16 NEON activation (LOGISTIC and TANH only): de-quantize to fp32, activate, re-quantize for dst.
+void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    using ActFn = ActivationLayerInfo::ActivationFunction;
+
+    constexpr int step    = 8;
+    const auto    x_begin = static_cast<int>(window.x().start());
+    const auto    x_end   = static_cast<int>(window.x().end());
+    const ActFn   act     = act_info.activation();
+
+    // X is walked by hand inside the lambda, so the iterated window takes a single step along X
+    Window win = window.collapse_if_possible(window, Window::DimZ);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator in_it(src, win);
+    Iterator out_it(dst, win);
+
+    const UniformQuantizationInfo in_qinfo  = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo out_qinfo = dst->info()->quantization_info().uniform();
+    const float                   alpha     = act_info.a();
+    const float                   beta      = act_info.b();
+    const float32x4_t             v_one     = vdupq_n_f32(1.f);
+    const float32x4_t             v_alpha   = vdupq_n_f32(alpha);
+    const float32x4_t             v_beta    = vdupq_n_f32(beta);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto src_row = reinterpret_cast<const qsymm16_t *>(in_it.ptr());
+        const auto dst_row = reinterpret_cast<qsymm16_t *>(out_it.ptr());
+
+        wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> vdst;
+        ARM_COMPUTE_UNUSED(vdst);
+        // Vector part: full blocks of `step` elements
+        int x = x_begin;
+        for(; x <= (x_end - step); x += step)
+        {
+            const auto vsrc = wrapper::vloadq(src_row + x);
+            switch(act)
+            {
+                case ActFn::LOGISTIC:
+                {
+                    // 1 / (1 + exp(-v)), evaluated on the de-quantized values
+                    const auto          vf = vdequantize_int16(vsrc, in_qinfo.scale);
+                    const float32x4x2_t vr =
+                    {
+                        {
+                            wrapper::vdiv(v_one, wrapper::vadd(v_one, wrapper::vexpq(wrapper::vneg(vf.val[0])))),
+                            wrapper::vdiv(v_one, wrapper::vadd(v_one, wrapper::vexpq(wrapper::vneg(vf.val[1])))),
+                        }
+                    };
+                    vdst = vquantize_int16(vr, out_qinfo.scale);
+                    break;
+                }
+                case ActFn::TANH:
+                {
+                    // a * tanh(b * v), evaluated on the de-quantized values
+                    const auto          vf = vdequantize_int16(vsrc, in_qinfo.scale);
+                    const float32x4x2_t vr =
+                    {
+                        {
+                            wrapper::vmul(v_alpha, wrapper::vtanh(wrapper::vmul(vf.val[0], v_beta))),
+                            wrapper::vmul(v_alpha, wrapper::vtanh(wrapper::vmul(vf.val[1], v_beta))),
+                        }
+                    };
+                    vdst = vquantize_int16(vr, out_qinfo.scale);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(dst_row + x, vdst);
+        }
+
+        // Scalar tail: elements left over after the last full vector
+        for(; x < x_end; ++x)
+        {
+            const qsymm16_t qin  = src_row[x];
+            qsymm16_t       qout = 0;
+            float           f    = 0.f;
+            switch(act)
+            {
+                case ActFn::LOGISTIC:
+                    f    = dequantize_qsymm16(qin, in_qinfo.scale);
+                    qout = quantize_qsymm16(1.f / (1.f + std::exp(-f)), out_qinfo);
+                    break;
+                case ActFn::TANH:
+                    f    = dequantize_qsymm16(qin, in_qinfo.scale);
+                    qout = quantize_qsymm16(alpha * std::tanh(beta * f), out_qinfo);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            dst_row[x] = qout;
+        }
+    },
+    in_it, out_it);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/SVE/fp16.cpp b/src/core/cpu/kernels/activation/SVE/fp16.cpp
new file mode 100644
index 0000000000..8208813cd3
--- /dev/null
+++ b/src/core/cpu/kernels/activation/SVE/fp16.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE)
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+// FP16 SVE activation: applies act_info's function to the elements of src covered by window and stores them in dst.
+void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    using ActFn = ActivationLayerInfo::ActivationFunction;
+
+    const auto  x_begin = static_cast<int>(window.x().start());
+    const auto  x_end   = static_cast<int>(window.x().end());
+    const ActFn act     = act_info.activation();
+
+    // X is walked by the predicated loop below, so the iterated window takes a single step along X
+    Window win = window.collapse_if_possible(window, Window::DimZ);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator in_it(src, win);
+    Iterator out_it(dst, win);
+
+    const svfloat16_t v_one   = svdup_n_f16(1.f);
+    const svfloat16_t v_zero  = svdup_n_f16(0.f);
+    const svfloat16_t v_six   = svdup_n_f16(6.f);
+    const svfloat16_t v_three = svdup_n_f16(3.f);
+    const svfloat16_t v_sixth = svdup_n_f16(0.166666667f);
+    const svfloat16_t v_alpha = svdup_n_f16(act_info.a());
+    const svfloat16_t v_beta  = svdup_n_f16(act_info.b());
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto src_row = reinterpret_cast<const float16_t *>(in_it.ptr());
+        const auto dst_row = reinterpret_cast<float16_t *>(out_it.ptr());
+
+        svfloat16_t res;
+        int         x      = x_begin;
+        svbool_t    active = svwhilelt_b16(x, x_end);
+        for(bool more = true; more; more = svptest_any(svptrue_b16(), active))
+        {
+            const auto v = svld1_f16(active, src_row + x);
+            switch(act)
+            {
+                case ActFn::ABS:
+                    res = svabs_f16_z(active, v);
+                    break;
+                case ActFn::LINEAR:
+                    res = svmla_f16_z(active, v_beta, v_alpha, v);
+                    break;
+                case ActFn::LOGISTIC:
+                    res = svinv_f16_z(active, svadd_f16_z(active, v_one, svexp_f16_z(active, svneg_f16_z(active, v))));
+                    break;
+                case ActFn::RELU:
+                    res = svmax_f16_z(active, v_zero, v);
+                    break;
+                case ActFn::BOUNDED_RELU:
+                    res = svmin_f16_z(active, v_alpha, svmax_f16_z(active, v_zero, v));
+                    break;
+                case ActFn::LU_BOUNDED_RELU:
+                    res = svmin_f16_z(active, v_alpha, svmax_f16_z(active, v_beta, v));
+                    break;
+                case ActFn::LEAKY_RELU:
+                    res = svadd_f16_z(active, svmul_f16_z(active, svmin_f16_z(active, v, v_zero), v_alpha), svmax_f16_z(active, v, v_zero));
+                    break;
+                case ActFn::SOFT_RELU:
+                    res = svlog_f16_z(active, svadd_f16_z(active, v_one, svexp_f16_z(active, v)));
+                    break;
+                case ActFn::ELU:
+                    res = svsel_f16(svcmpgt_f16(active, v, v_zero), v, svmul_f16_z(active, v_alpha, svsub_f16_z(active, svexp_f16_z(active, v), v_one)));
+                    break;
+                case ActFn::SQRT:
+                    res = svsqrt_f16_z(active, v);
+                    break;
+                case ActFn::SQUARE:
+                    res = svmul_f16_z(active, v, v);
+                    break;
+                case ActFn::TANH:
+                    res = svmul_f16_z(active, v_alpha, svtanh_f16_z(active, svmul_f16_z(active, v_beta, v)));
+                    break;
+                case ActFn::IDENTITY:
+                    res = v;
+                    break;
+                case ActFn::HARD_SWISH:
+                    res = svmul_f16_z(active, v, svmul_f16_z(active, v_sixth, svmin_f16_z(active, v_six, svmax_f16_z(active, v_zero, svadd_f16_z(active, v, v_three)))));
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            svst1_f16(active, dst_row + x, res);
+            x += svcnth();
+            active = svwhilelt_b16(x, x_end);
+        }
+    },
+    in_it, out_it);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // __ARM_FEATURE_SVE
+\ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/SVE/fp32.cpp b/src/core/cpu/kernels/activation/SVE/fp32.cpp
new file mode 100644
index 0000000000..55bdc9999e
--- /dev/null
+++ b/src/core/cpu/kernels/activation/SVE/fp32.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+// FP32 SVE activation: applies act_info's function to the elements of src covered by window and stores them in dst.
+void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    using ActFn = ActivationLayerInfo::ActivationFunction;
+
+    const auto  x_begin = static_cast<int>(window.x().start());
+    const auto  x_end   = static_cast<int>(window.x().end());
+    const ActFn act     = act_info.activation();
+
+    // X is walked by the predicated loop below, so the iterated window takes a single step along X
+    Window win = window.collapse_if_possible(window, Window::DimZ);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator in_it(src, win);
+    Iterator out_it(dst, win);
+
+    const svfloat32_t v_one   = svdup_n_f32(1.f);
+    const svfloat32_t v_zero  = svdup_n_f32(0.f);
+    const svfloat32_t v_six   = svdup_n_f32(6.f);
+    const svfloat32_t v_three = svdup_n_f32(3.f);
+    const svfloat32_t v_sixth = svdup_n_f32(0.166666667f);
+    const svfloat32_t v_alpha = svdup_n_f32(act_info.a());
+    const svfloat32_t v_beta  = svdup_n_f32(act_info.b());
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto src_row = reinterpret_cast<const float *>(in_it.ptr());
+        const auto dst_row = reinterpret_cast<float *>(out_it.ptr());
+
+        svfloat32_t res;
+        int         x      = x_begin;
+        svbool_t    active = svwhilelt_b32(x, x_end);
+        // The body always runs once; it repeats while the refreshed predicate still has an active lane
+        for(bool more = true; more; more = svptest_any(svptrue_b32(), active))
+        {
+            const auto v = svld1_f32(active, src_row + x);
+            switch(act)
+            {
+                case ActFn::ABS:
+                    res = svabs_f32_z(active, v);
+                    break;
+                case ActFn::LINEAR:
+                    res = svmla_f32_z(active, v_beta, v_alpha, v);
+                    break;
+                case ActFn::LOGISTIC:
+                    res = svinv_f32_z(active, svadd_f32_z(active, v_one, svexp_f32_z(active, svneg_f32_z(active, v))));
+                    break;
+                case ActFn::RELU:
+                    res = svmax_f32_z(active, v_zero, v);
+                    break;
+                case ActFn::BOUNDED_RELU:
+                    res = svmin_f32_z(active, v_alpha, svmax_f32_z(active, v_zero, v));
+                    break;
+                case ActFn::LU_BOUNDED_RELU:
+                    res = svmin_f32_z(active, v_alpha, svmax_f32_z(active, v_beta, v));
+                    break;
+                case ActFn::LEAKY_RELU:
+                    res = svadd_f32_z(active, svmul_f32_z(active, svmin_f32_z(active, v, v_zero), v_alpha), svmax_f32_z(active, v, v_zero));
+                    break;
+                case ActFn::SOFT_RELU:
+                    res = svlog_f32_z(active, svadd_f32_z(active, v_one, svexp_f32_z(active, v)));
+                    break;
+                case ActFn::ELU:
+                    res = svsel_f32(svcmpgt_f32(active, v, v_zero), v, svmul_f32_z(active, v_alpha, svsub_f32_z(active, svexp_f32_z(active, v), v_one)));
+                    break;
+                case ActFn::SQRT:
+                    res = svsqrt_f32_z(active, v);
+                    break;
+                case ActFn::SQUARE:
+                    res = svmul_f32_z(active, v, v);
+                    break;
+                case ActFn::TANH:
+                    res = svmul_f32_z(active, v_alpha, svtanh_f32_z(active, svmul_f32_z(active, v_beta, v)));
+                    break;
+                case ActFn::IDENTITY:
+                    res = v;
+                    break;
+                case ActFn::HARD_SWISH:
+                    res = svmul_f32_z(active, v, svmul_f32_z(active, v_sixth, svmin_f32_z(active, v_six, svmax_f32_z(active, v_zero, svadd_f32_z(active, v, v_three)))));
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            svst1_f32(active, dst_row + x, res);
+            x += svcntw();
+            active = svwhilelt_b32(x, x_end);
+        }
+    },
+    in_it, out_it);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // __ARM_FEATURE_SVE
+\ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/SVE/qasymm8.cpp b/src/core/cpu/kernels/activation/SVE/qasymm8.cpp
new file mode 100644
index 0000000000..9eea3ace9e
--- /dev/null
+++ b/src/core/cpu/kernels/activation/SVE/qasymm8.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    const auto                                    window_start_x = static_cast<int>(window.x().start());
+    const auto                                    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act            = act_info.activation();
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    const UniformQuantizationInfo qi_in           = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out          = dst->info()->quantization_info().uniform();
+    const auto                    va              = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+    const auto                    vb              = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+    const auto                    const_0         = quantize_qasymm8(0.f, qi_in);
+    const auto                    vconst_0        = svdup_n_u8(const_0);
+    const auto                    vconst_1        = svdup_n_f32(1.f);
+    const auto                    va_f32          = svdup_n_f32(act_info.a());
+    const auto                    vb_f32          = svdup_n_f32(act_info.b());
+    const auto                    const_6_f32     = svdup_n_f32(6.f);
+    const auto                    const_0_f32     = svdup_n_f32(0.f);
+    const auto                    const_3_f32     = svdup_n_f32(3.f);
+    const auto                    const_inv_6_f32 = svdup_n_f32(0.166666667f);
+
+    // Initialise scale/offset for re-quantization
+    bool requant = true;
+    if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+    {
+        requant = false;
+    }
+    float s  = qi_in.scale / qi_out.scale;
+    float o  = -qi_in.offset * s + qi_out.offset;
+    auto  vs = svdup_n_f32(s);
+    auto  vo = svdup_n_f32(o);
+
+    // Initialise scale/offset for re-quantization with int32_t
+    const auto voffset_in = svdup_n_s32(qi_in.offset);
+    int32_t    s_s32      = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    o_s32      = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_s32     = svdup_n_s32(s_s32);
+    const auto vo_s32     = svdup_n_s32(o_s32);
+
+    // Initialise scale/offset for re-quantization for leaky relu
+    int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+    const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+        svuint8_t tmp;
+
+        int      x  = window_start_x;
+        svbool_t pg = svwhilelt_b8(x, window_end_x);
+        do
+        {
+            const auto vin = svld1_u8(pg, input_ptr + x);
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                // Perform activation
+                tmp = svmax_u8_z(pg, vconst_0, vin);
+                // Re-quantize to new output space
+                tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+                // Re-quantize to new output space
+                tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+                // Re-quantize to new output space
+                tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                // Perform activation
+                const svfloat32x4_t tmp_dep =
+                {
+                    { {
+                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))),
+                        }
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = svquantize_z(pg, tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                // Perform activation
+                const svfloat32x4_t tmp_dep =
+                {
+                    { {
+                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))),
+                        }
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = svquantize_z(pg, tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                // Perform activation
+                const svfloat32x4_t tmp_dep =
+                {
+                    { {
+                            svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
+                            svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
+                            svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
+                            svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))),
+                        }
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = svquantize_z(pg, tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                svbool_t    p0, p1, p2, p3;
+                svint32x4_t tmp_dep;
+
+                // Expand to int32
+                const svint32x4_t vin_s32 =
+                {
+                    { {
+                            svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+                            svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+                            svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+                            svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))),
+                        }
+                    }
+                };
+
+                // Compare elements to input offset
+                if(qi_in.scale >= 0)
+                {
+                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+                else
+                {
+                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+
+                // Multiply negative elements and requantize if necessary
+                if(requant)
+                {
+                    tmp_dep = svcreate4_s32(
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+                }
+                else
+                {
+                    tmp_dep = svcreate4_s32(
+                                  svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                                  svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                                  svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                                  svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+                }
+
+                // Convert uint32 vectors to uint16 vectors (with saturation)
+                const auto v_low_u16  = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                // convert uint16 vectors to uint8 vectors (with saturation)
+                tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+
+            svst1_u8(pg, output_ptr + x, tmp);
+
+            x += svcntb();
+            pg = svwhilelt_b8(x, window_end_x);
+
+        }
+        while(svptest_any(svptrue_b8(), pg));
+
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE2) */
+\ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp
new file mode 100644
index 0000000000..0b3d798942
--- /dev/null
+++ b/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Run a QASYMM8_SIGNED activation over a window using SVE2 intrinsics.
+ *
+ * The X dimension is processed inside the kernel with a predicated loop that handles one SVE vector of
+ * int8 elements per iteration; the remaining dimensions of the window are walked by execute_window_loop.
+ *
+ * - RELU, BOUNDED_RELU and LU_BOUNDED_RELU operate directly on the quantized values and are re-quantized
+ *   only when the input and output quantization differ.
+ * - LOGISTIC, TANH and HARD_SWISH de-quantize to float32, apply the activation and quantize the result
+ *   with the output quantization info.
+ * - LEAKY_RELU uses 32-bit fixed-point arithmetic with 8 fractional bits.
+ *
+ * Any other activation function triggers ARM_COMPUTE_ERROR.
+ *
+ * @param[in]  src      Source tensor. Its buffer is read as int8_t.
+ * @param[out] dst      Destination tensor. Its buffer is written as int8_t.
+ * @param[in]  act_info Activation function together with its a() and b() parameters.
+ * @param[in]  window   Execution window. Its X range bounds the inner vector loop.
+ */
+void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    const auto                                    window_start_x = static_cast<int>(window.x().start());
+    const auto                                    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act            = act_info.activation();
+
+    // X is iterated manually inside the lambda, so the iterators only step over the other dimensions
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    const UniformQuantizationInfo qi_in           = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out          = dst->info()->quantization_info().uniform();
+    const auto                    va              = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+    const auto                    vb              = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+    const auto                    const_0         = quantize_qasymm8_signed(0.f, qi_in);
+    const auto                    vconst_0        = svdup_n_s8(const_0);
+    const auto                    vconst_1        = svdup_n_f32(1.f);
+    const auto                    va_f32          = svdup_n_f32(act_info.a());
+    const auto                    vb_f32          = svdup_n_f32(act_info.b());
+    const auto                    const_6_f32     = svdup_n_f32(6.f);
+    const auto                    const_0_f32     = svdup_n_f32(0.f);
+    const auto                    const_3_f32     = svdup_n_f32(3.f);
+    const auto                    const_inv_6_f32 = svdup_n_f32(0.166666667f);
+
+    // Initialise scale/offset for re-quantization.
+    // Re-quantization is only required when input and output quantization differ.
+    const bool  requant = !(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset);
+    const float s       = qi_in.scale / qi_out.scale;
+    const float o       = -qi_in.offset * s + qi_out.offset;
+    const auto  vs      = svdup_n_f32(s);
+    const auto  vo      = svdup_n_f32(o);
+
+    // Initialise scale/offset for re-quantization with int32_t (fixed point, 8 fractional bits)
+    const auto    voffset_in = svdup_n_s32(qi_in.offset);
+    const int32_t s_s32      = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const int32_t o_s32      = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto    vs_s32     = svdup_n_s32(s_s32);
+    const auto    vo_s32     = svdup_n_s32(o_s32);
+
+    // Initialise scale/offset for re-quantization for leaky relu (slope a() folded into the scale)
+    const int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                      arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+    const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+        svint8_t tmp;
+
+        // The predicate masks the elements past window_end_x, so no scalar tail loop is needed
+        int      x  = window_start_x;
+        svbool_t pg = svwhilelt_b8(x, window_end_x);
+        do
+        {
+            const auto vin = svld1_s8(pg, input_ptr + x);
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                // Perform activation
+                tmp = svmax_s8_z(pg, vconst_0, vin);
+                // Re-quantize to new output space
+                tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+                // Re-quantize to new output space
+                tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+                // Re-quantize to new output space
+                tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                // Perform activation: 1 / (1 + exp(-x))
+                const svfloat32x4_t tmp_dep = svcreate4_f32(
+                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+                // Re-quantize to new output space
+                tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                // Perform activation: a * tanh(b * x)
+                const svfloat32x4_t tmp_dep = svcreate4_f32(
+                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+                // Re-quantize to new output space
+                tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                // Perform activation: x * min(6, max(0, x + 3)) / 6
+                const svfloat32x4_t tmp_dep = svcreate4_f32(
+                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
+                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
+                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
+                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))));
+                // Re-quantize to new output space
+                tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                svbool_t    p0, p1, p2, p3;
+                svint32x4_t tmp_dep;
+
+                // Expand to int32
+                const svint32x4_t vin_s32 = svcreate4_s32(
+                                                svmovlb_s32(svmovlb_s16(vin)),
+                                                svmovlt_s32(svmovlb_s16(vin)),
+                                                svmovlb_s32(svmovlt_s16(vin)),
+                                                svmovlt_s32(svmovlt_s16(vin)));
+
+                // Compare elements to input offset.
+                // The comparison direction follows the sign of the input scale.
+                if(qi_in.scale >= 0)
+                {
+                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+                else
+                {
+                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+
+                // Multiply negative elements and requantize if necessary
+                if(requant)
+                {
+                    // Every lane is rescaled; lanes flagged by p0..p3 use the leaky scale/offset pair
+                    tmp_dep = svcreate4_s32(
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
+                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+                }
+                else
+                {
+                    // Only lanes flagged by p0..p3 are scaled; the merging (_m) forms leave the other lanes at their input value
+                    tmp_dep = svcreate4_s32(
+                                  svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                                  svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                                  svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                                  svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+                }
+
+                // Narrow int32 vectors to int16 vectors (with saturation)
+                const auto v_low_s16  = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                // Narrow int16 vectors to int8 vectors (with saturation)
+                tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+
+            svst1_s8(pg, output_ptr + x, tmp);
+
+            // Advance by the number of 8-bit lanes in a vector and rebuild the predicate for the next chunk
+            x += svcntb();
+            pg = svwhilelt_b8(x, window_end_x);
+        }
+        while(svptest_any(svptrue_b8(), pg));
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE2) */
diff --git a/src/core/cpu/kernels/activation/SVE/qsymm16.cpp b/src/core/cpu/kernels/activation/SVE/qsymm16.cpp
new file mode 100644
index 0000000000..dbaf267bf9
--- /dev/null
+++ b/src/core/cpu/kernels/activation/SVE/qsymm16.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/SVESymm.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Run a QSYMM16 activation over a window using SVE2 intrinsics.
+ *
+ * The X dimension is processed inside the kernel with a predicated loop that handles one SVE vector of
+ * int16 elements per iteration; the remaining dimensions of the window are walked by execute_window_loop.
+ *
+ * Only LOGISTIC and TANH are handled: the input is de-quantized with the input scale, the activation is
+ * evaluated in float32 and the result is quantized with the output scale. Any other activation function
+ * triggers ARM_COMPUTE_ERROR.
+ *
+ * @param[in]  src      Source tensor. Its buffer is read as int16_t.
+ * @param[out] dst      Destination tensor. Its buffer is written as int16_t.
+ * @param[in]  act_info Activation function together with its a() and b() parameters.
+ * @param[in]  window   Execution window. Its X range bounds the inner vector loop.
+ */
+void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    const auto                                    window_start_x = static_cast<int>(window.x().start());
+    const auto                                    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act            = act_info.activation();
+
+    // X is iterated manually inside the lambda, so the iterators only step over the other dimensions
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    const UniformQuantizationInfo qi_in    = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out   = dst->info()->quantization_info().uniform();
+    const auto                    vconst_1 = svdup_n_f32(1.f);
+    const auto                    va_f32   = svdup_n_f32(act_info.a());
+    const auto                    vb_f32   = svdup_n_f32(act_info.b());
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+        svint16_t tmp;
+
+        // The predicate masks the elements past window_end_x, so no scalar tail loop is needed
+        int      x  = window_start_x;
+        svbool_t pg = svwhilelt_b16(x, window_end_x);
+        do
+        {
+            const auto vin = svld1_s16(pg, input_ptr + x);
+            if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+                // Perform activation: 1 / (1 + exp(-x))
+                const svfloat32x2_t tmp_dep = svcreate2_f32(
+                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
+                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
+                // Re-quantize to new output space
+                tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                // De-quantize
+                const auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+                // Perform activation: a * tanh(b * x)
+                const svfloat32x2_t tmp_dep = svcreate2_f32(
+                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
+                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
+                // Re-quantize to new output space
+                tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+
+            svst1_s16(pg, output_ptr + x, tmp);
+
+            // Advance by the number of 16-bit lanes in a vector and rebuild the predicate for the next chunk
+            x += svcnth();
+            pg = svwhilelt_b16(x, window_end_x);
+        }
+        while(svptest_any(svptrue_b16(), pg));
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE2) */
diff --git a/src/core/cpu/kernels/activation/list.h b/src/core/cpu/kernels/activation/list.h
new file mode 100644
index 0000000000..409d025db0
--- /dev/null
+++ b/src/core/cpu/kernels/activation/list.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
+#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
+
+namespace arm_compute
+{
+// Forward declarations of the types used in the kernel signatures below, so that this header
+// can be included without relying on the include order of the translation unit.
+class ITensor;
+class ActivationLayerInfo;
+class Window;
+
+namespace cpu
+{
+/** Declare an activation kernel entry point.
+ *
+ * All activation kernels listed in this header share the same signature:
+ * source tensor, destination tensor, activation info and execution window.
+ */
+#define DECLARE_ACTIVATION_KERNEL(func_name) \
+    void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+
+// One NEON and one SVE entry point is declared per data type
+DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation);
+DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation);
+DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation);
+DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
+DECLARE_ACTIVATION_KERNEL(fp16_sve_activation);
+DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
+DECLARE_ACTIVATION_KERNEL(fp32_sve_activation);
+
+// The helper macro is local to this header
+#undef DECLARE_ACTIVATION_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */
diff --git a/src/core/cpu/kernels/floor/impl/NEON/fp16.cpp b/src/core/cpu/kernels/floor/NEON/fp16.cpp
index 0d31eb77f8..0d31eb77f8 100644
--- a/src/core/cpu/kernels/floor/impl/NEON/fp16.cpp
+++ b/src/core/cpu/kernels/floor/NEON/fp16.cpp
diff --git a/src/core/cpu/kernels/floor/impl/NEON/fp32.cpp b/src/core/cpu/kernels/floor/NEON/fp32.cpp
index dd63f9f9d7..dd63f9f9d7 100644
--- a/src/core/cpu/kernels/floor/impl/NEON/fp32.cpp
+++ b/src/core/cpu/kernels/floor/NEON/fp32.cpp
diff --git a/src/core/cpu/kernels/floor/impl/list.h b/src/core/cpu/kernels/floor/list.h
index 4367e0ffc9..4367e0ffc9 100644
--- a/src/core/cpu/kernels/floor/impl/list.h
+++ b/src/core/cpu/kernels/floor/list.h
author	Georgios Pinitas <georgios.pinitas@arm.com>	2021-01-08 17:25:55 +0000
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2021-01-11 16:48:31 +0000
commit	f8f0442e9a6105be0e32f4defec5fbc10248ea6e (patch)
tree	d4e77c82f57df175dcec6c46ed2f74f4a8b72d7a /src/core/cpu
parent	4f77ba9f2dccbae1b46b2d4e17d862560f858050 (diff)
download	ComputeLibrary-f8f0442e9a6105be0e32f4defec5fbc10248ea6e.tar.gz