Diffstat (limited to 'src/cpu/kernels')
-rw-r--r--src/cpu/kernels/CpuActivationKernel.cpp375
-rw-r--r--src/cpu/kernels/CpuActivationKernel.h107
-rw-r--r--src/cpu/kernels/CpuAddKernel.cpp251
-rw-r--r--src/cpu/kernels/CpuAddKernel.h110
-rw-r--r--src/cpu/kernels/CpuAddMulAddKernel.cpp209
-rw-r--r--src/cpu/kernels/CpuAddMulAddKernel.h106
-rw-r--r--src/cpu/kernels/CpuCastKernel.cpp1179
-rw-r--r--src/cpu/kernels/CpuCastKernel.h98
-rw-r--r--src/cpu/kernels/CpuCol2ImKernel.cpp129
-rw-r--r--src/cpu/kernels/CpuCol2ImKernel.h88
-rw-r--r--src/cpu/kernels/CpuConcatenateBatchKernel.cpp219
-rw-r--r--src/cpu/kernels/CpuConcatenateBatchKernel.h73
-rw-r--r--src/cpu/kernels/CpuConcatenateDepthKernel.cpp217
-rw-r--r--src/cpu/kernels/CpuConcatenateDepthKernel.h81
-rw-r--r--src/cpu/kernels/CpuConcatenateHeightKernel.cpp190
-rw-r--r--src/cpu/kernels/CpuConcatenateHeightKernel.h70
-rw-r--r--src/cpu/kernels/CpuConcatenateWidthKernel.cpp181
-rw-r--r--src/cpu/kernels/CpuConcatenateWidthKernel.h70
-rw-r--r--src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp123
-rw-r--r--src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h84
-rw-r--r--src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp145
-rw-r--r--src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h63
-rw-r--r--src/cpu/kernels/CpuCopyKernel.cpp175
-rw-r--r--src/cpu/kernels/CpuCopyKernel.h67
-rw-r--r--src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp192
-rw-r--r--src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h109
-rw-r--r--src/cpu/kernels/CpuDequantizeKernel.cpp118
-rw-r--r--src/cpu/kernels/CpuDequantizeKernel.h71
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.cpp175
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.h94
-rw-r--r--src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp541
-rw-r--r--src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h97
-rw-r--r--src/cpu/kernels/CpuDirectConv3dKernel.cpp198
-rw-r--r--src/cpu/kernels/CpuDirectConv3dKernel.h107
-rw-r--r--src/cpu/kernels/CpuElementwiseKernel.cpp511
-rw-r--r--src/cpu/kernels/CpuElementwiseKernel.h232
-rw-r--r--src/cpu/kernels/CpuElementwiseUnaryKernel.cpp270
-rw-r--r--src/cpu/kernels/CpuElementwiseUnaryKernel.h93
-rw-r--r--src/cpu/kernels/CpuFillKernel.cpp92
-rw-r--r--src/cpu/kernels/CpuFillKernel.h61
-rw-r--r--src/cpu/kernels/CpuFloorKernel.cpp141
-rw-r--r--src/cpu/kernels/CpuFloorKernel.h87
-rw-r--r--src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp159
-rw-r--r--src/cpu/kernels/CpuGemmInterleave4x4Kernel.h80
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp877
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h80
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp449
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h162
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp721
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h126
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp1036
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h144
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp337
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h114
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp234
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h120
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp246
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h124
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp247
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h118
-rw-r--r--src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp116
-rw-r--r--src/cpu/kernels/CpuGemmMatrixAdditionKernel.h98
-rw-r--r--src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp213
-rw-r--r--src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h114
-rw-r--r--src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp142
-rw-r--r--src/cpu/kernels/CpuGemmTranspose1xWKernel.h97
-rw-r--r--src/cpu/kernels/CpuIm2ColKernel.cpp423
-rw-r--r--src/cpu/kernels/CpuIm2ColKernel.h146
-rw-r--r--src/cpu/kernels/CpuKernelSelectionTypes.h131
-rw-r--r--src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp148
-rw-r--r--src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h97
-rw-r--r--src/cpu/kernels/CpuMulKernel.cpp1831
-rw-r--r--src/cpu/kernels/CpuMulKernel.h182
-rw-r--r--src/cpu/kernels/CpuPermuteKernel.cpp288
-rw-r--r--src/cpu/kernels/CpuPermuteKernel.h69
-rw-r--r--src/cpu/kernels/CpuPool2dKernel.cpp451
-rw-r--r--src/cpu/kernels/CpuPool2dKernel.h95
-rw-r--r--src/cpu/kernels/CpuPool3dKernel.cpp186
-rw-r--r--src/cpu/kernels/CpuPool3dKernel.h94
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.cpp169
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.h87
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.cpp271
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.h94
-rw-r--r--src/cpu/kernels/CpuScaleKernel.cpp538
-rw-r--r--src/cpu/kernels/CpuScaleKernel.h117
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.cpp292
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.h90
-rw-r--r--src/cpu/kernels/CpuSubKernel.cpp222
-rw-r--r--src/cpu/kernels/CpuSubKernel.h111
-rw-r--r--src/cpu/kernels/CpuTransposeKernel.cpp819
-rw-r--r--src/cpu/kernels/CpuTransposeKernel.h63
-rw-r--r--src/cpu/kernels/CpuWeightsReshapeKernel.cpp174
-rw-r--r--src/cpu/kernels/CpuWeightsReshapeKernel.h91
-rw-r--r--src/cpu/kernels/CpuWinogradConv2dKernel.cpp106
-rw-r--r--src/cpu/kernels/CpuWinogradConv2dKernel.h109
-rw-r--r--src/cpu/kernels/activation/generic/neon/fp16.cpp43
-rw-r--r--src/cpu/kernels/activation/generic/neon/fp32.cpp39
-rw-r--r--src/cpu/kernels/activation/generic/neon/impl.h246
-rw-r--r--src/cpu/kernels/activation/generic/neon/lut.cpp57
-rw-r--r--src/cpu/kernels/activation/generic/neon/qasymm8.cpp310
-rw-r--r--src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp274
-rw-r--r--src/cpu/kernels/activation/generic/neon/qsymm16.cpp156
-rw-r--r--src/cpu/kernels/activation/generic/sve/fp16.cpp173
-rw-r--r--src/cpu/kernels/activation/generic/sve/fp32.cpp146
-rw-r--r--src/cpu/kernels/activation/generic/sve2/lut.cpp57
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8.cpp240
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp274
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qsymm16.cpp128
-rw-r--r--src/cpu/kernels/activation/list.h54
-rw-r--r--src/cpu/kernels/add/generic/neon/fp16.cpp40
-rw-r--r--src/cpu/kernels/add/generic/neon/fp32.cpp37
-rw-r--r--src/cpu/kernels/add/generic/neon/impl.cpp723
-rw-r--r--src/cpu/kernels/add/generic/neon/impl.h183
-rw-r--r--src/cpu/kernels/add/generic/neon/integer.cpp49
-rw-r--r--src/cpu/kernels/add/generic/neon/qasymm8.cpp39
-rw-r--r--src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp39
-rw-r--r--src/cpu/kernels/add/generic/neon/qsymm16.cpp180
-rw-r--r--src/cpu/kernels/add/generic/sve/fp16.cpp41
-rw-r--r--src/cpu/kernels/add/generic/sve/fp32.cpp40
-rw-r--r--src/cpu/kernels/add/generic/sve/impl.cpp147
-rw-r--r--src/cpu/kernels/add/generic/sve/impl.h40
-rw-r--r--src/cpu/kernels/add/generic/sve/integer.cpp52
-rw-r--r--src/cpu/kernels/add/generic/sve2/qasymm8.cpp262
-rw-r--r--src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp213
-rw-r--r--src/cpu/kernels/add/generic/sve2/qsymm16.cpp160
-rw-r--r--src/cpu/kernels/add/list.h59
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/fp16.cpp965
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/fp32.cpp733
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp846
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp846
-rw-r--r--src/cpu/kernels/addmuladd/list.h49
-rw-r--r--src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h140
-rw-r--r--src/cpu/kernels/assembly/arm_gemm.hpp302
-rw-r--r--src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp122
-rw-r--r--src/cpu/kernels/assembly/arm_gemm_local.hpp31
-rw-r--r--src/cpu/kernels/assembly/convolution_parameters.hpp65
-rw-r--r--src/cpu/kernels/assembly/gemm_common.hpp312
-rw-r--r--src/cpu/kernels/assembly/ndrange.hpp196
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp42
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp38
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp102
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/impl.h102
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp38
-rw-r--r--src/cpu/kernels/boundingboxtransform/list.h39
-rw-r--r--src/cpu/kernels/cast/generic/neon/fp16.cpp380
-rw-r--r--src/cpu/kernels/cast/list.h45
-rw-r--r--src/cpu/kernels/conv3d/neon/list.h197
-rw-r--r--src/cpu/kernels/conv3d/neon/quantized.h275
-rw-r--r--src/cpu/kernels/crop/generic/neon/crop_helper.h86
-rw-r--r--src/cpu/kernels/crop/generic/neon/fp16.cpp47
-rw-r--r--src/cpu/kernels/crop/generic/neon/fp32.cpp45
-rw-r--r--src/cpu/kernels/crop/generic/neon/impl.h124
-rw-r--r--src/cpu/kernels/crop/generic/neon/integer.cpp116
-rw-r--r--src/cpu/kernels/crop/list.h56
-rw-r--r--src/cpu/kernels/depth_to_space/list.h47
-rw-r--r--src/cpu/kernels/depth_to_space/nchw/any/impl.cpp123
-rw-r--r--src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp105
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp44
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp40
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp641
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h374
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp51
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp51
-rw-r--r--src/cpu/kernels/depthwiseconv2d/list.h42
-rw-r--r--src/cpu/kernels/dequantize/generic/neon/fp16.cpp37
-rw-r--r--src/cpu/kernels/dequantize/generic/neon/fp32.cpp35
-rw-r--r--src/cpu/kernels/dequantize/generic/neon/impl.h340
-rw-r--r--src/cpu/kernels/dequantize/generic/neon/list.h43
-rw-r--r--src/cpu/kernels/directconv2d/impl.h389
-rw-r--r--src/cpu/kernels/directconv2d/list.h77
-rw-r--r--src/cpu/kernels/directconv2d/nchw/all.cpp147
-rw-r--r--src/cpu/kernels/directconv2d/nchw/fp16.cpp84
-rw-r--r--src/cpu/kernels/directconv2d/nchw/impl.h166
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp75
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp40
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp270
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/impl.h45
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp65
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp104
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp101
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/impl.h1316
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp198
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp105
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp104
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp107
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp102
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp297
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/impl.h167
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp199
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/impl.h393
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp106
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp104
-rw-r--r--src/cpu/kernels/elementwise_binary/list.h72
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp41
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp39
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/impl.h300
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp39
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp63
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp44
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp44
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp42
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp41
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp112
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/impl.h36
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp40
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp58
-rw-r--r--src/cpu/kernels/elementwise_unary/list.h53
-rw-r--r--src/cpu/kernels/floor/list.h40
-rw-r--r--src/cpu/kernels/floor/neon/fp16.cpp64
-rw-r--r--src/cpu/kernels/floor/neon/fp32.cpp61
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp63
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp46
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/generic/impl.h381
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/list.h59
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp47
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp96
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp93
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp50
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp48
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h173
-rw-r--r--src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp85
-rw-r--r--src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp36
-rw-r--r--src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp79
-rw-r--r--src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h37
-rw-r--r--src/cpu/kernels/gemm_matrix_add/list.h37
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp418
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp43
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp656
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h42
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/list.h38
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/fp16.cpp39
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/fp32.cpp37
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/impl.cpp71
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/impl.h75
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp37
-rw-r--r--src/cpu/kernels/genproposals/list.h40
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/fp16.cpp199
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/fp32.cpp41
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/impl.cpp156
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/impl.h46
-rw-r--r--src/cpu/kernels/instancenorm/list.h38
-rw-r--r--src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp397
-rw-r--r--src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h143
-rw-r--r--src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp320
-rw-r--r--src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h134
-rw-r--r--src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp49
-rw-r--r--src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp47
-rw-r--r--src/cpu/kernels/l2normlayer/generic/neon/impl.h131
-rw-r--r--src/cpu/kernels/l2normlayer/list.h42
-rw-r--r--src/cpu/kernels/lut/generic/neon/u8.cpp408
-rw-r--r--src/cpu/kernels/lut/generic/sve/u16.cpp103
-rw-r--r--src/cpu/kernels/lut/generic/sve2/u8.cpp644
-rw-r--r--src/cpu/kernels/lut/list.h60
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/fp16.cpp36
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/fp32.cpp34
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/impl.h53
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp34
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp34
-rw-r--r--src/cpu/kernels/maxunpool/list.h39
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp110
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp35
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp108
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/impl.h38
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp145
-rw-r--r--src/cpu/kernels/meanstddevnorm/list.h40
-rw-r--r--src/cpu/kernels/mul/generic/neon/fp16.cpp145
-rw-r--r--src/cpu/kernels/mul/generic/neon/fp32.cpp138
-rw-r--r--src/cpu/kernels/mul/generic/neon/list.h38
-rw-r--r--src/cpu/kernels/norm_layer/generic/neon/fp16.cpp67
-rw-r--r--src/cpu/kernels/norm_layer/generic/neon/fp32.cpp63
-rw-r--r--src/cpu/kernels/norm_layer/generic/neon/impl.h177
-rw-r--r--src/cpu/kernels/norm_layer/generic/neon/list.h49
-rw-r--r--src/cpu/kernels/pool2d/neon/fp16.cpp659
-rw-r--r--src/cpu/kernels/pool2d/neon/fp32.cpp481
-rw-r--r--src/cpu/kernels/pool2d/neon/impl.h138
-rw-r--r--src/cpu/kernels/pool2d/neon/list.h113
-rw-r--r--src/cpu/kernels/pool2d/neon/nchw/all.cpp462
-rw-r--r--src/cpu/kernels/pool2d/neon/qasymm8.cpp47
-rw-r--r--src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp47
-rw-r--r--src/cpu/kernels/pool2d/neon/quantized.h832
-rw-r--r--src/cpu/kernels/pool3d/list.h44
-rw-r--r--src/cpu/kernels/pool3d/neon/fp16.cpp38
-rw-r--r--src/cpu/kernels/pool3d/neon/fp32.cpp34
-rw-r--r--src/cpu/kernels/pool3d/neon/impl.h484
-rw-r--r--src/cpu/kernels/pool3d/neon/qasymm8.cpp34
-rw-r--r--src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp34
-rw-r--r--src/cpu/kernels/pool3d/neon/quantized.h399
-rw-r--r--src/cpu/kernels/quantize/generic/neon/fp16.cpp45
-rw-r--r--src/cpu/kernels/quantize/generic/neon/fp32.cpp48
-rw-r--r--src/cpu/kernels/quantize/generic/neon/impl.h330
-rw-r--r--src/cpu/kernels/quantize/generic/neon/integer.cpp82
-rw-r--r--src/cpu/kernels/quantize/generic/neon/list.h66
-rw-r--r--src/cpu/kernels/range/generic/neon/fp16.cpp41
-rw-r--r--src/cpu/kernels/range/generic/neon/fp32.cpp39
-rw-r--r--src/cpu/kernels/range/generic/neon/impl.h84
-rw-r--r--src/cpu/kernels/range/generic/neon/integer.cpp64
-rw-r--r--src/cpu/kernels/range/list.h46
-rw-r--r--src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp65
-rw-r--r--src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp73
-rw-r--r--src/cpu/kernels/reduction_layer/generic/neon/impl.h1633
-rw-r--r--src/cpu/kernels/reduction_layer/generic/neon/integer.cpp62
-rw-r--r--src/cpu/kernels/reduction_layer/generic/neon/list.h66
-rw-r--r--src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp63
-rw-r--r--src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp63
-rw-r--r--src/cpu/kernels/roialign/generic/neon/fp16.cpp43
-rw-r--r--src/cpu/kernels/roialign/generic/neon/fp32.cpp39
-rw-r--r--src/cpu/kernels/roialign/generic/neon/impl.h380
-rw-r--r--src/cpu/kernels/roialign/generic/neon/qasymm8.cpp39
-rw-r--r--src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp39
-rw-r--r--src/cpu/kernels/roialign/list.h40
-rw-r--r--src/cpu/kernels/scale/neon/fp16.cpp276
-rw-r--r--src/cpu/kernels/scale/neon/integer.cpp783
-rw-r--r--src/cpu/kernels/scale/neon/list.h617
-rw-r--r--src/cpu/kernels/scale/neon/qasymm8.cpp406
-rw-r--r--src/cpu/kernels/scale/neon/qasymm8_signed.cpp394
-rw-r--r--src/cpu/kernels/scale/sve/fp16.cpp123
-rw-r--r--src/cpu/kernels/scale/sve/fp32.cpp120
-rw-r--r--src/cpu/kernels/scale/sve/integer.cpp195
-rw-r--r--src/cpu/kernels/scale/sve/list.h47
-rw-r--r--src/cpu/kernels/scale/sve/qasymm8.cpp120
-rw-r--r--src/cpu/kernels/scale/sve/qasymm8_signed.cpp120
-rw-r--r--src/cpu/kernels/select/generic/neon/fp16.cpp50
-rw-r--r--src/cpu/kernels/select/generic/neon/fp32.cpp47
-rw-r--r--src/cpu/kernels/select/generic/neon/impl.h177
-rw-r--r--src/cpu/kernels/select/generic/neon/integer.cpp99
-rw-r--r--src/cpu/kernels/select/list.h58
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp16.cpp72
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp32.cpp69
-rw-r--r--src/cpu/kernels/softmax/generic/neon/impl.cpp596
-rw-r--r--src/cpu/kernels/softmax/generic/neon/impl.h428
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8.cpp68
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp68
-rw-r--r--src/cpu/kernels/softmax/generic/sme2/fp16.cpp781
-rw-r--r--src/cpu/kernels/softmax/generic/sme2/fp32.cpp585
-rw-r--r--src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp634
-rw-r--r--src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp655
-rw-r--r--src/cpu/kernels/softmax/generic/sve/impl.cpp179
-rw-r--r--src/cpu/kernels/softmax/generic/sve/impl.h46
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/impl.cpp212
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/impl.h43
-rw-r--r--src/cpu/kernels/softmax/list.h81
-rw-r--r--src/cpu/kernels/sub/neon/fp16.cpp43
-rw-r--r--src/cpu/kernels/sub/neon/impl.h164
-rw-r--r--src/cpu/kernels/sub/neon/list.h50
-rw-r--r--src/cpu/kernels/sub/neon/qasymm8.cpp46
-rw-r--r--src/cpu/kernels/sub/neon/qasymm8_signed.cpp47
-rw-r--r--src/cpu/kernels/sub/neon/qsymm16.cpp197
347 files changed, 63169 insertions, 0 deletions
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
new file mode 100644
index 0000000000..7cfa39b286
--- /dev/null
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2017-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuActivationKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/activation/list.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = {
+#ifdef ARM_COMPUTE_ENABLE_SVE
+ {"sve2_q8_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) &&
+ data.cpumodel == CPUModel::A510 && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::RELU;
+ },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)},
+#endif // ARM_COMPUTE_ENABLE_SVE
+#ifdef __aarch64__
+ {// Neon LUT implementation takes precedence
+ "neon_q8_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) &&
+ data.f != ActivationLayerInfo::ActivationFunction::RELU;
+ },
+ REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)},
+#endif // __aarch64__
+ {"sve2_qu8_activation",
+ [](const ActivationDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8 && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)},
+ {"sve2_qs8_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)},
+ {"sve2_qs16_activation",
+ [](const ActivationDataTypeISASelectorData &data) {
+ return data.dt == DataType::QSYMM16 && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
+ {"sve_fp16_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve &&
+ data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)},
+ {"sve_fp16_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)},
+ {"sve_fp32_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)},
+ {"neon_fp16_activation",
+ [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)},
+ {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)},
+ {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)},
+ {"neon_qs8_activation",
+ [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)},
+ {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)},
+};
+
+/* Supported activations in the 8-bit integer domain */
+static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+ ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU,
+};
+/* Supported activations in the 16-bit integer domain */
+static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = {
+ ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH,
+ ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+
+ const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ const DataType data_type = src->data_type();
+ const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+ const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_data_type_quantized_asymmetric(data_type) &&
+ (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) ==
+ std::end(qasymm8_activations)),
+ "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) &&
+ (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations),
+ f_act) == std::end(qsymm16_activations)),
+ "For QSYMM16 only tanh and logistic are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+ ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+
+ // Checks performed when dst is configured
+ if ((dst != nullptr) && (dst->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ if (dst != nullptr)
+ {
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, *src->clone());
+ }
+
+ return std::make_pair(Status{}, win);
+}
+#ifdef __aarch64__
+void init_lut(ActivationLayerInfo::ActivationFunction act_func,
+ DataType data_type,
+ const UniformQuantizationInfo &qi_in,
+ const UniformQuantizationInfo &qi_out,
+ ActivationLayerInfo::LookupTable256 &lut,
+ float a,
+ float b)
+{
+ for (size_t i = 0; i < lut.size(); ++i)
+ {
+ float tmp_f =
+ (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in);
+ switch (act_func)
+ {
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp_f = std::abs(tmp_f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp_f = a * tmp_f + b;
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp_f = std::min<>(a, std::max(0.f, tmp_f));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp_f = std::min<>(a, std::max<>(b, tmp_f));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp_f = (tmp_f > 12.f) ? tmp_f : std::log(1.f + std::exp(tmp_f));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp_f = (tmp_f >= 0) ? tmp_f : a * (std::exp(tmp_f) - 1);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp_f = std::sqrt(tmp_f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp_f = tmp_f * tmp_f;
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp_f = a * std::tanh(b * tmp_f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp_f = tmp_f / (1.f + std::exp(-a * tmp_f));
+ break;
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp_f = tmp_f * (0.5f * (1.0f + erff(tmp_f / 1.41421356237f)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ tmp_f = 0;
+ break;
+ }
+ lut[i] =
+ (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+}
+#endif // __aarch64__
+} // namespace
+
+void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info)
+{
+ ARM_COMPUTE_UNUSED(dst);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
+
+ const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
+ if (dst != nullptr)
+ {
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, *src->clone());
+ }
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuActivationKernel").append("/").append(uk->name);
+
+#ifdef __aarch64__
+ // Initialise lut_manager
+ LUTManager &lut_manager = LUTManager::get_instance();
+
+ if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) &&
+ activation_info.activation() != ActivationFunction::RELU)
+ {
+ ActivationLayerInfo::LookupTable256 tmp_lut;
+ init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(),
+ (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut,
+ activation_info.a(), activation_info.b());
+ activation_info.setLookupTable256(tmp_lut);
+ }
+
+ if (src->data_type() == DataType::F16 &&
+ activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()};
+ activation_info.setLookupTable65536((lut_manager.get_lut_table(info)));
+ }
+#endif // __aarch64__
+ _act_info = activation_info;
+
+ Window win;
+
+ // Use squashed window
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+ ICPPKernel::configure(win);
+}
+
+Status
+CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
+
+ return Status{};
+}
+
+size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ if (_split_dimension == Window::DimX)
+ {
+ // Don't split the workload too small if the tensor has been reinterpreted as 1D.
+ // This number is loosely chosen as threading overhead in each platform varies wildly.
+ return 1536;
+ }
+ return default_mws;
+}
+
+void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ // Early exit on disabled activation
+ if (!_act_info.enabled())
+ {
+ return;
+ }
+
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _run_method(src, dst, _act_info, window);
+}
+
+const char *CpuActivationKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuActivationKernel::ActivationKernel> &CpuActivationKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
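
The quantized LUT kernels registered at the top of this file (sve2_q8_activation_lut, neon_q8_activation_lut) exploit the fact that an 8-bit input can only take 256 distinct values: init_lut() above precomputes quantize(act(dequantize(i))) for every i at configure time, so the per-element work at run time collapses to a table lookup. A minimal, self-contained sketch of that construction for the LOGISTIC case is shown below; the helper name and rounding details are illustrative, not the library's dequantize_qasymm8/quantize_qasymm8 routines.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>

// Sketch only: build a 256-entry table mapping every possible QASYMM8 value
// through dequantize -> logistic -> requantize, mirroring what init_lut()
// does for the activation functions it supports.
std::array<uint8_t, 256> build_logistic_lut(float in_scale, int32_t in_offset,
                                            float out_scale, int32_t out_offset)
{
    std::array<uint8_t, 256> lut{};
    for (size_t i = 0; i < lut.size(); ++i)
    {
        const float   x = (static_cast<int32_t>(i) - in_offset) * in_scale;    // dequantize
        const float   y = 1.f / (1.f + std::exp(-x));                          // logistic in fp32
        const int32_t q = static_cast<int32_t>(std::lround(y / out_scale)) + out_offset;
        lut[i]          = static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // saturate to uint8
    }
    return lut;
}

The F16 LOGISTIC path set up in configure() works on the same principle but uses the 65536-entry table obtained from LUTManager, since a 16-bit input has 65536 possible values.
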
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
new file mode 100644
index 0000000000..c1487499d6
--- /dev/null
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/core/helpers/LUTManager.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the activation kernel */
+class CpuActivationKernel : public ICpuKernel<CpuActivationKernel>
+{
+private:
+ using ActivationKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
+
+public:
+ CpuActivationKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @note If the output tensor is a nullptr, the activation function will be performed in-place
+ *
+ * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result
+ * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ * @param[in] activation_info Activation layer information.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuActivationKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] small_network_mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+ *
+ * @return The split dimension hint.
+ */
+ size_t get_split_dimension_hint() const
+ {
+ return _split_dimension;
+ }
+
+ struct ActivationKernel
+ {
+ const char *name;
+ const ActivationDataTypeISASelectorDataPtr is_selected;
+ ActivationKernelPtr ukernel;
+ };
+
+ static const std::vector<ActivationKernel> &get_available_kernels();
+
+private:
+ ActivationLayerInfo _act_info{};
+ ActivationKernelPtr _run_method{nullptr};
+ size_t _split_dimension{Window::DimY};
+ std::string _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
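
The configure()/validate()/run_op() contract documented above can be illustrated with a short usage sketch. Everything below is illustrative rather than taken from the library's tests: the shapes are arbitrary, the internal header is assumed to be reachable on the include path, and the kernel is run single-threaded over its full window instead of being dispatched through the scheduler.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/cpu/kernels/CpuActivationKernel.h"

using namespace arm_compute;

void run_relu_sketch()
{
    // Illustrative shape and data type; any type listed in the docs above would do.
    TensorInfo src_info(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 16U), 1, DataType::F32);

    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);
    if (!bool(cpu::kernels::CpuActivationKernel::validate(&src_info, &dst_info, act)))
    {
        return; // configuration not supported
    }

    cpu::kernels::CpuActivationKernel kernel;
    kernel.configure(&src_info, &dst_info, act);

    // Back the infos with real tensors and hand them to the kernel through a pack.
    Tensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);

    // Single-threaded call over the full window; an operator would normally
    // dispatch this through the scheduler, using get_split_dimension_hint()
    // to pick the split dimension.
    kernel.run_op(pack, kernel.window(), ThreadInfo{});
}

Passing dst as nullptr to configure() instead selects the in-place behaviour described in the @note above.
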
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
new file mode 100644
index 0000000000..a990aa4715
--- /dev/null
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuAddKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/add/list.h"
+
+#include <array>
+
+#if defined(ENABLE_FP32_KERNELS)
+namespace
+{
+static constexpr size_t default_mws_N1_fp32_neon = 24536;
+static constexpr size_t default_mws_V1_fp32_neon = 40510;
+} // namespace
+#endif /* ENABLE_FP32_KERNELS */
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuAddKernel::AddKernel> available_kernels = {
+ {"neon_qu8_add_fixedpoint",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)},
+ {"neon_qs8_add_fixedpoint",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)},
+ {"sve2_qu8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)},
+ {"sve2_qs8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)},
+ {"sve2_qs16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; },
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)},
+ {"sve_fp32_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)},
+ {"sve_fp16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+ REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)},
+ {"sve_u8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)},
+ {"sve_s16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)},
+ {"sve_s32_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)},
+ {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)},
+ {"neon_fp16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)},
+ {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)},
+ {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)},
+ {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)},
+ {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)},
+ {"neon_qs8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)},
+ {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}};
+
+Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (src0.tensor_shape().x() != src1.tensor_shape().x()) &&
+ ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) ||
+ (src1.data_type() != dst.data_type())),
+ "Broadcasting across width is supported on configurations where all tensors have the same data type");
+
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+
+ const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
+ const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+ CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ return Status{};
+}
+} // namespace
+
+void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
+
+ const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst);
+ const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+ CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _policy = policy;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuAddKernel").append("/").append(uk->name);
+
+ // Auto initialize dst if not initialized
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape());
+ set_shape_if_empty(*dst, out_shape);
+ set_data_type_if_unknown(*dst, src0->data_type());
+
+ // Configure kernel window
+ Window win;
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src0, *src1);
+
+ ICpuKernel::configure(win);
+}
+
+Status
+CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
+
+ return Status{};
+}
+
+void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _run_method(src0, src1, dst, _policy, window);
+}
+
+const char *CpuAddKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuAddKernel::AddKernel> &CpuAddKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if (this->_run_method == &add_fp32_neon)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if (platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_mws_N1_fp32_neon;
+ }
+ else if (platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if (this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
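
To make the MWS scaling at the end of get_mws() concrete, here is a worked example with illustrative (not benchmarked) numbers. Assume add_fp32_neon was selected on a Neoverse N1, so the starting point is default_mws_N1_fp32_neon = 24536 elements, and the configured window covers 256 (X) x 16 (Y) x 16 (Z) iterations:

    num_iterations_total()     = 256 * 16 * 16 = 65536
    num_iterations(1)          = 16                     (the Y dimension)
    elements per Y iteration   = 65536 / 16    = 4096
    scaled mws                 = 24536 / 4096  = 5      (integer division)
    returned value             = max(1, 5)     = 5

Expressed in Y iterations, a chunk of 5 rows already covers about 5 * 4096 = 20480 elements, close to the original per-element threshold, so the 16 Y iterations leave room for roughly three workloads (16 / 5) instead of collapsing to a single thread as the unscaled value would.
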
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
new file mode 100644
index 0000000000..4adba8bb16
--- /dev/null
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ADD_KERNEL_H
+#define ARM_COMPUTE_CPU_ADD_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform addition between two tensors */
+class CpuAddKernel : public ICpuKernel<CpuAddKernel>
+{
+private:
+ using AddKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+
+public:
+ struct AddKernel
+ {
+ const char *name;
+ const CpuAddKernelDataTypeISASelectorDataPtr is_selected;
+ AddKernelPtr ukernel;
+ };
+
+ CpuAddKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel);
+ /** Initialise the kernel's input, dst and border mode.
+ *
+ * Valid configurations (src0,src1) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+ * @param[in] policy Overflow policy.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuAddKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+ static const std::vector<AddKernel> &get_available_kernels();
+
+ size_t get_split_dimension() const
+ {
+ return _split_dimension;
+ }
+
+private:
+ ConvertPolicy _policy{};
+ AddKernelPtr _run_method{nullptr};
+ std::string _name{};
+ size_t _split_dimension{Window::DimY};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ADD_KERNEL_H */
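
The valid-configuration list above, together with the broadcast rules enforced in validate_arguments(), can be checked ahead of time through the static validate() entry point. A minimal sketch, assuming the internal header is reachable on the include path and using illustrative shapes:

#include "arm_compute/core/TensorInfo.h"

#include "src/cpu/kernels/CpuAddKernel.h"

using namespace arm_compute;

// Sketch: ask whether an F32 add that broadcasts the second operand along Y
// would be accepted, before allocating anything.
bool can_broadcast_add_f32()
{
    const TensorInfo a(TensorShape(16U, 8U), 1, DataType::F32);
    const TensorInfo b(TensorShape(16U, 1U), 1, DataType::F32); // broadcast along Y
    TensorInfo       out;                                       // left empty on purpose

    const Status st = cpu::kernels::CpuAddKernel::validate(&a, &b, &out, ConvertPolicy::SATURATE);
    return bool(st); // true when no error was reported
}

If the check passes, configure() auto-initializes the empty dst info to the broadcast shape and the src data type via set_shape_if_empty()/set_data_type_if_unknown(), as shown in CpuAddKernel.cpp above.
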
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp
new file mode 100644
index 0000000000..6a632e8702
--- /dev/null
+++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuAddMulAddKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/addmuladd/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
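+// Registry of the available micro-kernels: each entry pairs a name and a (data type, ISA)
+// selector with the function pointer that run_op() dispatches to. On non-aarch64 builds the
+// list is empty, so validate_arguments() below rejects any configuration.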
+static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels = {
+#ifdef __aarch64__
+ {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)},
+ {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)},
+ {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)},
+ {"neon_qasymm8_signed_add_mul_add",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)}
+#endif // __aarch64__
+};
+
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(policy != ConvertPolicy::SATURATE, "Only the SATURATE convert policy is supported");
+
+ using ActFunction = ActivationLayerInfo::ActivationFunction;
+ const ActFunction act_func = act_info.activation();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU &&
+ act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY),
+ "Only RELU Family activations, or no activation, is supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+
+ if (is_data_type_quantized(input1->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, bn_mul);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, bn_add);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients must be a 1D array");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0],
+ "First dimension of the inputs and the BatchNorm coefficients must match");
+
+ // Validate in case we have add layer's output (intermediate) initialized
+ if (add_output != nullptr && add_output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output);
+ }
+
+ // Validate in case final output has been initialized
+ if (final_output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, final_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output);
+ }
+
+ const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+ DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ return Status{};
+}
+} // namespace
+
+void CpuAddMulAddKernel::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+
+ const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+ DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+
+ _policy = policy;
+ _act_info = act_info;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuAddMulAddKernel/").append(uk->name);
+
+ // Auto initialize outputs if not initialized
+ set_shape_if_empty(*final_output, input1->tensor_shape());
+ set_data_type_if_unknown(*final_output, input1->data_type());
+
+ if (add_output != nullptr)
+ {
+ set_shape_if_empty(*add_output, input1->tensor_shape());
+ set_data_type_if_unknown(*add_output, input1->data_type());
+ }
+
+ // Configure kernel window
+ Window win;
+ win = calculate_max_window(*final_output, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuAddMulAddKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+
+ return Status{};
+}
+
+void CpuAddMulAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensor *input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3);
+ ITensor *add_output = tensors.get_tensor(TensorType::ACL_DST_0);
+ ITensor *final_output = tensors.get_tensor(TensorType::ACL_DST_1);
+
+ _run_method(input1, input2, bn_mul, bn_add, add_output, final_output, _policy, _act_info, window);
+}
+
+const char *CpuAddMulAddKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuAddMulAddKernel::AddMulAddKernel> &CpuAddMulAddKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h
new file mode 100644
index 0000000000..c5e31ec291
--- /dev/null
+++ b/src/cpu/kernels/CpuAddMulAddKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SRC_CPU_KERNELS_CPUADDMULADDKERNEL
+#define SRC_CPU_KERNELS_CPUADDMULADDKERNEL
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform a fused element-wise addition of two tensors followed by a multiply-add with the BatchNorm-style coefficients (bn_mul, bn_add) */
+class CpuAddMulAddKernel : public ICpuKernel<CpuAddMulAddKernel>
+{
+private:
+ using AddMulAddKernelPtr = std::add_pointer<void(const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ ITensor *,
+ ITensor *,
+ ConvertPolicy,
+ const ActivationLayerInfo &,
+ const Window &)>::type;
+
+public:
+ struct AddMulAddKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ AddMulAddKernelPtr ukernel;
+ };
+
+ CpuAddMulAddKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddMulAddKernel);
+ /** Initialize the kernel's inputs and outputs.
+ *
+ * Similar to @ref NEAddMulAdd::configure()
+ *
+ */
+ void configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
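+ /** A sketch of the computation this kernel is expected to perform, inferred from the operand
+ * names and the checks in validate() (not a verbatim specification from the library docs):
+ *
+ * @code
+ * add_output   = input1 + input2                   // element-wise, SATURATE policy
+ * final_output = act(add_output * bn_mul + bn_add) // bn_mul/bn_add are 1D and applied along
+ *                                                  // the first dimension
+ * @endcode
+ */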
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuAddMulAddKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ static const std::vector<AddMulAddKernel> &get_available_kernels();
+
+private:
+ ConvertPolicy _policy{};
+ ActivationLayerInfo _act_info{};
+ AddMulAddKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CPU_KERNELS_CPUADDMULADDKERNEL */
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp
new file mode 100644
index 0000000000..05c7742b03
--- /dev/null
+++ b/src/cpu/kernels/CpuCastKernel.cpp
@@ -0,0 +1,1179 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuCastKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/cast/list.h"
+#include "support/SaturateCast.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuCastKernel::CastKernel> available_kernels = {
+ {"neon_qs8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)},
+ {"neon_qu8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+ {"neon_u8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+ {"neon_fp16_cast",
+ [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)},
+ {"neon_fp32_to_fp16_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)},
+ {"neon_s32_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)},
+};
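+// Note: only casts that involve F16 are dispatched through the micro-kernel registry above;
+// every other conversion is handled directly by the NEON paths in run_op().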
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
+ ARM_COMPUTE_UNUSED(policy);
+ ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+#ifdef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
+ DataType::F32, DataType::S32, DataType::S64, DataType::U64);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32, DataType::S64);
+
+#else // __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
+ DataType::F32, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32);
+#endif // __aarch64__
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 &&
+ dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+ "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32),
+ "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32),
+ "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 &&
+ (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
+ "Only data_types supported [in] U16 -> [out] U8, U32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
+ "Only data_types supported [in] S16 -> [out] U8, S32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 &&
+ dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
+ "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
+ "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 &&
+ dst->data_type() != DataType::S64),
+ "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64");
+#ifdef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32,
+ "Only data_types supported [in] S64 -> [out] F32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32,
+ "Only data_types supported [in] U64 -> [out] F32");
+#endif // __aarch64__
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Auto-initialize the dst shape if not initialized (only the shape can be auto-configured; the data type must be given)
+ set_shape_if_empty(*dst, src->tensor_shape());
+
+ _policy = policy;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ ICPPKernel::configure(win);
+}
+
+Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
+ return Status{};
+}
+#ifdef __aarch64__
+namespace
+{
+template <typename T1, typename T2>
+inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr)
+{
+ ARM_COMPUTE_UNUSED(src_ptr);
+ ARM_COMPUTE_UNUSED(dst_ptr);
+}
+
+template <>
+inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr)
+{
+ const int32x4x4_t texels = {
+ {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}};
+ vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0])));
+ vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0])));
+ vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1])));
+ vst1q_s64(dst_ptr + 6, vmovl_s32(vget_high_s32(texels.val[1])));
+ vst1q_s64(dst_ptr + 8, vmovl_s32(vget_low_s32(texels.val[2])));
+ vst1q_s64(dst_ptr + 10, vmovl_s32(vget_high_s32(texels.val[2])));
+ vst1q_s64(dst_ptr + 12, vmovl_s32(vget_low_s32(texels.val[3])));
+ vst1q_s64(dst_ptr + 14, vmovl_s32(vget_high_s32(texels.val[3])));
+}
+
+template <>
+inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr)
+{
+ const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}};
+ const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}};
+ const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
+ vst1q_f32(dst_ptr, texels.val[0]);
+ vst1q_f32(dst_ptr + 4, texels.val[1]);
+ vst1q_f32(dst_ptr + 8, texels.val[2]);
+ vst1q_f32(dst_ptr + 12, texels.val[3]);
+}
+
+template <>
+inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr)
+{
+ const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}};
+ const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}};
+
+ const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
+
+ vst1q_f32(dst_ptr, texels.val[0]);
+ vst1q_f32(dst_ptr + 4, texels.val[1]);
+ vst1q_f32(dst_ptr + 8, texels.val[2]);
+ vst1q_f32(dst_ptr + 12, texels.val[3]);
+}
+
+template <typename T1, typename T2>
+inline void
+convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
+{
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+}
+} // namespace
+#endif // __aarch64__
+
+void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
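+ // The vectorized loops below convert 16 elements per iteration; any remainder at the end of
+ // a row is handled by the scalar "left-over" loops.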
+
+ const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON(_src == _dst);
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator src(_src, win);
+ Iterator dst(_dst, win);
+
+ /* A micro-kernel is only registered for casts that involve FP16, so check that it is not a nullptr right before it is used */
+ const auto *uk = CpuCastKernel::get_implementation(
+ CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()});
+
+ switch (_src->info()->data_type())
+ {
+#ifdef __aarch64__
+ case DataType::U64:
+ {
+ switch (_dst->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ convert64<uint64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+ case DataType::S64:
+ {
+ switch (_dst->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ convert64<int64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+#endif // __aarch64__
+
+ case DataType::QASYMM8_SIGNED:
+ {
+ switch (_dst->info()->data_type())
+ {
+ case DataType::S16:
+ {
+ /* Up-conversion QASYMM8_SIGNED -> S16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+
+ vst1q_s16(dst_ptr + x, texels.val[0]);
+ vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion QASYMM8_SIGNED -> S32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+
+ vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion QASYMM8_SIGNED -> F32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::F16:
+ {
+ /* Up-conversion QASYMM8_SIGNED -> F16 */
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_src, _dst, info, _policy, window);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ switch (_dst->info()->data_type())
+ {
+ case DataType::S16:
+ {
+ /* Up-conversion U8 -> S16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+
+ vst1q_s16(dst_ptr + x, texels.val[0]);
+ vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion U8 -> S32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+
+ vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion U8 -> F32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::F16:
+ {
+ /* Up-conversion U8 -> FP16 */
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_src, _dst, info, _policy, window);
+ break;
+ }
+ case DataType::U16:
+ {
+ /* Up-conversion U8 -> U16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+
+ const uint16x8x2_t texels = {
+ {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}};
+
+ vst1q_u16(dst_ptr + x, texels.val[0]);
+ vst1q_u16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+ case DataType::S16:
+ {
+ switch (_dst->info()->data_type())
+ {
+ case DataType::QASYMM8_SIGNED:
+ {
+ /* Down-conversion S16 -> QASYMM8_SIGNED */
+ if (ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
+
+ vst1q_s8(dst_ptr + x,
+ vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
+
+ vst1q_s8(dst_ptr + x,
+ vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ break;
+ }
+ case DataType::U8:
+ {
+ /* Down-conversion S16 -> U8 */
+ if (ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
+
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
+
+ vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
+ vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion S16 -> S32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
+
+ const int32x4x4_t texels_s32 = {
+ {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])),
+ vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}};
+
+ vst1q_s32(dst_ptr + x, texels_s32.val[0]);
+ vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
+ vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
+ vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+
+ case DataType::U16:
+ {
+ switch (_dst->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ /* Down-conversion U16 -> U8 */
+ if (ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
+
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
+
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ break;
+ }
+ case DataType::U32:
+ {
+ /* Up-conversion U16 -> U32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
+
+ vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
+ vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
+ vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
+ vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+ case DataType::F16:
+ {
+ /* conversion F16 -> any data type */
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_src, _dst, info, _policy, window);
+ break;
+ }
+ case DataType::F32:
+ switch (_dst->info()->data_type())
+ {
+ case DataType::F16:
+ {
+ /* Down-conversion F32 -> F16 */
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_src, _dst, info, _policy, window);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Conversion F32 -> S32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t texels = {{
+ vld1q_f32(src_ptr + x),
+ vld1q_f32(src_ptr + x + 4),
+ vld1q_f32(src_ptr + x + 8),
+ vld1q_f32(src_ptr + x + 12),
+ }};
+
+ vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
+ vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
+ vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
+ vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ /* Down-conversion F32 -> U8 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t texels = {{
+ vld1q_f32(src_ptr + x),
+ vld1q_f32(src_ptr + x + 4),
+ vld1q_f32(src_ptr + x + 8),
+ vld1q_f32(src_ptr + x + 12),
+ }};
+
+ vst1_u8(dst_ptr + x,
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])),
+ vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
+ vst1_u8(dst_ptr + x + 8,
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])),
+ vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::QASYMM8_SIGNED:
+ {
+ /* Down-conversion F32 -> QASYMM8_SIGNED */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t texels = {{
+ vld1q_f32(src_ptr + x),
+ vld1q_f32(src_ptr + x + 4),
+ vld1q_f32(src_ptr + x + 8),
+ vld1q_f32(src_ptr + x + 12),
+ }};
+
+ vst1_s8(dst_ptr + x,
+ vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])),
+ vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
+ vst1_s8(dst_ptr + x + 8,
+ vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])),
+ vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ case DataType::S32:
+ switch (_dst->info()->data_type())
+ {
+#if __aarch64__
+ case DataType::S64:
+ {
+ convert64<int32_t, int64_t>(src, dst, win, window_start_x, window_end_x, window_step_x);
+ break;
+ }
+#endif // __aarch64__
+ case DataType::F16:
+ {
+ /* Down-conversion S32 -> F16 */
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_src, _dst, info, _policy, window);
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Conversion S32 -> F32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{
+ vld1q_s32(src_ptr + x),
+ vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12),
+ }};
+
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::QASYMM8_SIGNED:
+ {
+ /* Down-conversion S32 -> QASYMM8_SIGNED */
+ if (ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{
+ vld1q_s32(src_ptr + x),
+ vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12),
+ }};
+ vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]),
+ vqmovn_s32(texels.val[1]))));
+ vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]),
+ vqmovn_s32(texels.val[3]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+
+ vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]),
+ vmovn_s32(texels.val[1]))));
+ vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]),
+ vmovn_s32(texels.val[3]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ break;
+ }
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ /* Down-conversion S32 -> U8 */
+ if (ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+ vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]),
+ vqmovun_s32(texels.val[1]))));
+ vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]),
+ vqmovun_s32(texels.val[3]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+
+ vst1_u8(dst_ptr + x,
+ vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
+ vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
+ vst1_u8(dst_ptr + x + 8,
+ vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
+ vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+const char *CpuCastKernel::name() const
+{
+ return "CpuCastKernel.cpp";
+}
+
+const std::vector<CpuCastKernel::CastKernel> &CpuCastKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h
new file mode 100644
index 0000000000..ddbfe1f034
--- /dev/null
+++ b/src/cpu/kernels/CpuCastKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Casts a given tensor to a new type
+ *
+ * @note When casting between quantized types the scale and zeroPoint are ignored
+ */
+class CpuCastKernel : public ICpuKernel<CpuCastKernel>
+{
+private:
+ using CastKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type;
+
+public:
+ CpuCastKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCastKernel);
+ /** Set the src and dst of the kernel
+ *
+ * Valid conversions src -> dst :
+ *
+ * - QASYMM8_SIGNED -> S16, S32, F32, F16
+ * - QASYMM8 -> U16, S16, S32, F32, F16
+ * - U8 -> U16, S16, S32, F32, F16
+ * - U16 -> U8, U32
+ * - S16 -> QASYMM8_SIGNED, U8, S32
+ * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
+ * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8, S64
+ * - S64 -> F32
+ * - U64 -> F32
+ * - F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8
+ *
+ * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/U64/F16/F32.
+ * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/S64/F16/F32.
+ * @param[in] policy Conversion policy.
+ *
+ * @note S64, U64 and the S32 -> S64 conversion are only supported on aarch64
+ *
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
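+ /** Illustrative usage (a minimal sketch; the shape and the U8 -> F32 conversion are just one
+ * of the valid combinations listed above):
+ *
+ * @code
+ * TensorInfo src(TensorShape(32U, 8U), 1, DataType::U8);
+ * TensorInfo dst(TensorShape(32U, 8U), 1, DataType::F32);
+ *
+ * CpuCastKernel cast;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuCastKernel::validate(&src, &dst, ConvertPolicy::SATURATE));
+ * cast.configure(&src, &dst, ConvertPolicy::SATURATE);
+ * @endcode
+ */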
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuCastKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct CastKernel
+ {
+ const char *name;
+ const CastDataTypeISASelectorDataPtr is_selected;
+ CastKernelPtr ukernel;
+ };
+
+ static const std::vector<CastKernel> &get_available_kernels();
+
+private:
+ ConvertPolicy _policy{ConvertPolicy::SATURATE};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H
diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp
new file mode 100644
index 0000000000..a52a1f58ea
--- /dev/null
+++ b/src/cpu/kernels/CpuCol2ImKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuCol2ImKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims)
+{
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ // Validate configured output
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ compute_col2im_shape(*src, convolved_dims, false));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuCol2ImKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims));
+
+ _convolved_dims = convolved_dims;
+
+ // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, false)));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ ICpuKernel::configure(win);
+}
+
+Status CpuCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *output, const Size2D &convolved_dims)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, output, convolved_dims));
+ return Status{};
+}
+
+void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const uint8_t el_size = src->info()->element_size();
+ const int output_stride_x = dst->info()->strides_in_bytes().x();
+ const int output_stride_y = dst->info()->strides_in_bytes().y();
+ const int output_stride_z = dst->info()->strides_in_bytes().z();
+
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Create iterators
+ Iterator in(src, window);
+ Iterator out(dst, window_out);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int hidx = id.y();
+ const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y +
+ (hidx % _convolved_dims.width) * output_stride_x;
+ std::memcpy(out.ptr() + idx, in.ptr(), el_size);
+ },
+ in, out);
+}
+
+const char *CpuCol2ImKernel::name() const
+{
+ return "CpuCol2ImKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h
new file mode 100644
index 0000000000..3e394ac914
--- /dev/null
+++ b/src/cpu/kernels/CpuCol2ImKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_COL2IM_KERNEL_H
+#define ARM_COMPUTE_CPU_COL2IM_KERNEL_H
+
+#include "arm_compute/core/Size2D.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to perform col2im reshaping.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CpuIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CpuCol2ImKernel : public ICpuKernel<CpuCol2ImKernel>
+{
+public:
+ /** Default constructor */
+ CpuCol2ImKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCol2ImKernel);
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] src The input tensor info to convert. Data types supported: All
+     * @param[out] dst            The output tensor info. The 3 lower dimensions represent a single output [width, height, OFM],
+     *                            while the rest represent a batch of outputs. Data types supported: Same as @p src
+ * @param[in] convolved_dims Output convolved dimensions.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuCol2ImKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ Size2D _convolved_dims{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_COL2IM_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
new file mode 100644
index 0000000000..8c290173e8
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+template <typename T>
+void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, const Window &window)
+{
+ // Offset src
+ uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ // Offset dst
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ batch_offset * dst->info()->strides_in_bytes()[3];
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16 / dst->info()->element_size();
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1));
+
+ Iterator src_it(src, win);
+ Iterator dst_it(dst, win);
+
+ const DataType dt = src->info()->data_type();
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+                    wrapper::vstore(out_ptr + x,
+                                    vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+                    wrapper::vstore(out_ptr + x,
+                                    vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
+ }
+}
+
+Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst);
+
+ return Status{};
+}
+} // namespace
+
+void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
+
+ _func = nullptr;
+ _batch_offset = batch_offset;
+
+ switch (src->data_type())
+ {
+ case DataType::S8:
+ case DataType::U8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ _func = &batch_concat<uint8_t>;
+ break;
+ case DataType::S16:
+ case DataType::U16:
+ case DataType::F16:
+ _func = &batch_concat<uint16_t>;
+ break;
+ case DataType::S32:
+ case DataType::U32:
+ case DataType::F32:
+ _func = &batch_concat<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuConcatenateBatchKernel::validate(const arm_compute::ITensorInfo *src,
+ unsigned int batch_offset,
+ const arm_compute::ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
+ return Status{};
+}
+
+void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset,
+ window);
+}
+
+const char *CpuConcatenateBatchKernel::name() const
+{
+ return "CpuConcatenateBatchKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
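
Reduced to plain contiguous buffers, the copy that batch_concat performs (and, with a different axis and stride, the depth/height/width variants further below) looks like the following sketch; the function and parameter names are illustrative only.

#include <cassert>
#include <cstddef>
#include <vector>

// Copy every batch of 'src' into 'dst' starting at batch index 'batch_offset',
// the same mapping the kernel realises through dst->info()->strides_in_bytes()[3].
void concat_batch_reference(const std::vector<float> &src, std::size_t src_batches,
                            std::vector<float> &dst, std::size_t dst_batches,
                            std::size_t elems_per_batch, std::size_t batch_offset)
{
    assert(src.size() == src_batches * elems_per_batch);
    assert(dst.size() == dst_batches * elems_per_batch);
    assert(batch_offset + src_batches <= dst_batches);

    for (std::size_t b = 0; b < src_batches; ++b)
    {
        const float *in  = src.data() + b * elems_per_batch;
        float       *out = dst.data() + (batch_offset + b) * elems_per_batch;
        for (std::size_t i = 0; i < elems_per_batch; ++i)
        {
            out[i] = in[i]; // plain copy; the quantized branches additionally requantize each value
        }
    }
}
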
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h
new file mode 100644
index 0000000000..52ea553a7d
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H
+#define ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the batch concatenate kernel.
+ *  The source tensor will be concatenated into the destination tensor.
+ */
+class CpuConcatenateBatchKernel : public ICpuKernel<CpuConcatenateBatchKernel>
+{
+public:
+ CpuConcatenateBatchKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateBatchKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data types supported: All.
+     * @param[in]     batch_offset The offset on the batch axis (axis 3).
+ * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
+ */
+ void configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConcatenateBatchKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
+
+private:
+ BatchConcatFunction *_func{nullptr};
+ unsigned int _batch_offset{0};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
new file mode 100644
index 0000000000..c75e1e4477
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+template <typename T>
+void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, const Window &window)
+{
+ // Offset source
+ uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ // Offset destination
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ depth_offset * dst->info()->strides_in_bytes()[2];
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16 / dst->info()->element_size();
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1));
+
+ Iterator src_it(src, win);
+ Iterator dst_it(dst, win);
+
+ const DataType dt = src->info()->data_type();
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x,
+ vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x,
+ vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
+ }
+}
+
+Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+ return Status{};
+}
+} // namespace
+
+void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
+
+ _func = nullptr;
+ _depth_offset = depth_offset;
+
+ switch (src->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &depth_concat<uint8_t>;
+ break;
+ case DataType::QASYMM8_SIGNED:
+ _func = &depth_concat<int8_t>;
+ break;
+ case DataType::F16:
+ _func = &depth_concat<uint16_t>;
+ break;
+ case DataType::F32:
+ _func = &depth_concat<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuConcatenateDepthKernel::validate(const arm_compute::ITensorInfo *src,
+ unsigned int depth_offset,
+ const arm_compute::ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst));
+ return Status{};
+}
+
+void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _depth_offset,
+ window);
+}
+
+const char *CpuConcatenateDepthKernel::name() const
+{
+ return "CpuConcatenateDepthKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
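
The mismatched-quantization branches above dequantize each value with the source (scale, offset) pair and re-quantize it with the destination pair; a scalar sketch of that step follows. The helper names are local illustrations, and the rounding is simplified compared to the library's quantize_qasymm8.

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo
{
    float   scale;
    int32_t offset;
};

inline float dequantize_u8(uint8_t q, UniformQInfo qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

inline uint8_t quantize_u8(float v, UniformQInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// One element of the QASYMM8 mixed-qinfo path: map a value quantized with
// src_qinfo into the destination quantization domain.
inline uint8_t requantize_u8(uint8_t in, UniformQInfo src_qinfo, UniformQInfo dst_qinfo)
{
    return quantize_u8(dequantize_u8(in, src_qinfo), dst_qinfo);
}
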
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h
new file mode 100644
index 0000000000..54de9aff46
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H
+#define ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the depth concatenate kernel.
+ *  The source tensor will be concatenated into the destination tensor.
+ */
+class CpuConcatenateDepthKernel : public ICpuKernel<CpuConcatenateDepthKernel>
+{
+public:
+ CpuConcatenateDepthKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateDepthKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] depth_offset The offset on the Z axis.
+ * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
+ *
+     * @note The two lowest dimensions of the output tensor must not be smaller than those of the input.
+     * @note The difference between the input and output sizes in each of the two lowest dimensions must be divisible by 2.
+ *
+ */
+ void configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConcatenateDepthKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
+
+private:
+ DepthConcatFunction *_func{nullptr};
+ unsigned int _depth_offset{0};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
new file mode 100644
index 0000000000..b6c11d948b
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
+ for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuConcatenateHeightKernel::configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst)
+{
+ ARM_COMPUTE_UNUSED(src);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
+
+ _height_offset = height_offset;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuConcatenateHeightKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst));
+ return Status{};
+}
+
+void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ // Offset destination pointer to the correct position
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
+ const int window_step_x = 16;
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1));
+
+ // Create iterators
+ Iterator src_it(src, win);
+ Iterator dst_it(dst, win);
+
+ const DataType dt = src->info()->data_type();
+ const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_u8(dst_ptr + dst_it.offset() + x,
+ vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+ vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo),
+ dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = src_it.ptr();
+ const auto out_ptr = dst_ptr + dst_it.offset();
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
+ }
+}
+
+const char *CpuConcatenateHeightKernel::name() const
+{
+ return "CpuConcatenateHeightKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h
new file mode 100644
index 0000000000..df880c4878
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H
+#define ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the height concatenate kernel.
+ * The source tensor will be concatenated into the destination tensor.
+ */
+class CpuConcatenateHeightKernel : public ICpuKernel<CpuConcatenateHeightKernel>
+{
+public:
+ CpuConcatenateHeightKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateHeightKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data types supported: All
+ * @param[in] height_offset The starting offset on the Y axis for the output tensor.
+ * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
+ *
+ */
+ void configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConcatenateHeightKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ unsigned int _height_offset{0};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
new file mode 100644
index 0000000000..f6100cccca
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
+
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuConcatenateWidthKernel::configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
+ ARM_COMPUTE_UNUSED(dst);
+
+ _width_offset = width_offset;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ ICpuKernel::configure(win);
+}
+
+Status CpuConcatenateWidthKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst));
+ return Status{};
+}
+
+void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ // Offset output pointer to the correct position
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ _width_offset * dst->info()->strides_in_bytes()[0];
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
+ constexpr int window_step_x = 16;
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator src_it(src, win);
+ Iterator dst_it(dst, win);
+ const DataType dt = src->info()->data_type();
+ const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_u8(dst_ptr + dst_it.offset() + x,
+ vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+ vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo),
+ dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = src_it.ptr();
+ const auto out_ptr = dst_ptr + dst_it.offset();
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
+ }
+}
+
+const char *CpuConcatenateWidthKernel::name() const
+{
+ return "CpuConcatenateWidthKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h
new file mode 100644
index 0000000000..560e44e35a
--- /dev/null
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H
+#define ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel.
+ * The source tensor will be concatenated into the destination tensor.
+ */
+class CpuConcatenateWidthKernel : public ICpuKernel<CpuConcatenateWidthKernel>
+{
+public:
+ CpuConcatenateWidthKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateWidthKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data types supported: All
+ * @param[in] width_offset The offset on the X axis.
+ * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
+ */
+ void configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConcatenateWidthKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ unsigned int _width_offset{0};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
new file mode 100644
index 0000000000..87703ec631
--- /dev/null
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
+
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output tensor auto initialisation if not yet initialized
+ auto_init_if_empty(*dst, *src->clone());
+
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
+
+ const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
+
+ const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
+
+ const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx];
+ const unsigned int num_channels = original_input_shape[channel_idx];
+
+ _factor1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels;
+ _factor2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_input_shape.total_size_lower(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+ // Checks performed when dst is configured
+ if ((dst != nullptr) && (dst->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+
+void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const unsigned int dst_stride_x = dst->info()->strides_in_bytes().x();
+ const unsigned int dst_stride_y = dst->info()->strides_in_bytes().y();
+ const unsigned int element_size = src->info()->element_size();
+
+ Iterator input(src, window);
+ Iterator output(dst, window);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ memcpy(output.ptr() + id.x() * dst_stride_x +
+                       ((id.y() % _factor1) * _factor2 + id.y() / _factor1) * dst_stride_y,
+ input.ptr(), element_size);
+ },
+ input);
+}
+
+const char *CpuConvertFullyConnectedWeightsKernel::name() const
+{
+ return "CpuConvertFullyConnectedWeightsKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
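
On a plain row-major weight matrix, the remapping that run_op applies through dst_stride_y amounts to the row permutation sketched below; factor1 and factor2 play the roles of _factor1 and _factor2 set in configure(), while the function name and buffer layout are illustrative assumptions.

#include <cstddef>
#include <vector>

// Move row r of a [num_rows x num_cols] weight matrix to row
// (r % factor1) * factor2 + r / factor1, where num_rows == factor1 * factor2.
// This swaps between the NCHW and NHWC flattening of the original input.
std::vector<float> convert_fc_weights_reference(const std::vector<float> &weights,
                                                std::size_t               num_rows,
                                                std::size_t               num_cols,
                                                std::size_t               factor1,
                                                std::size_t               factor2)
{
    std::vector<float> converted(weights.size());
    for (std::size_t row = 0; row < num_rows; ++row)
    {
        const std::size_t new_row = (row % factor1) * factor2 + row / factor1;
        for (std::size_t col = 0; col < num_cols; ++col)
        {
            converted[new_row * num_cols + col] = weights[row * num_cols + col];
        }
    }
    return converted;
}
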
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
new file mode 100644
index 0000000000..2253889e69
--- /dev/null
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H
+#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
+ *
+ * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
+ * - It follows a Convolution layer
+ * - The data layout used by the network does not match the one the model has been trained in.
+ *
+ * @note This function assumes the weights are already reshaped (transposed)
+ */
+class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel<CpuConvertFullyConnectedWeightsKernel>
+{
+public:
+ CpuConvertFullyConnectedWeightsKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertFullyConnectedWeightsKernel);
+ /** Set the src and dst tensor.
+ *
+ * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
+ * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
+ * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
+ * @param[in] data_layout The data layout the weights have been trained in.
+ */
+ void configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+    unsigned int _factor1{
+        0}; /* Number of elements per original src plane if @p data_layout == NCHW; number of channels otherwise */
+    unsigned int _factor2{
+        0}; /* Number of elements per original src plane if @p data_layout == NHWC; number of channels otherwise */
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
new file mode 100644
index 0000000000..745b1566c2
--- /dev/null
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+ // Validate output if initialized
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
+{
+    // Output auto initialization if not yet initialized
+ {
+ const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED;
+ const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+ const int offset_correction = is_input_signed ? -128 : 128;
+ const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+
+ auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
+ }
+
+ return std::make_pair(Status{}, calculate_max_window(*dst));
+}
+} // namespace
+
+void CpuConvertQuantizedSignednessKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+ std::pair<Status, Window> win_config = validate_and_configure_window(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICpuKernel::configure(win_config.second);
+}
+
+Status CpuConvertQuantizedSignednessKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+ return Status{};
+}
+
+void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const uint8_t mask = 128;
+ const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
+ *(output_ptr + x) = in ^ mask;
+ }
+ },
+ input, output);
+}
+
+const char *CpuConvertQuantizedSignednessKernel::name() const
+{
+ return "CpuConvertQuantizedSignednessKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
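
Per element, the conversion above is just a flip of the most significant bit; a scalar sketch follows. The quantization offset also has to be shifted by 128, as done in validate_and_configure_window(), so the re-interpreted values keep the same real-valued meaning; the helper names below are illustrative.

#include <cstdint>

// QASYMM8 <-> QASYMM8_SIGNED raw-value conversion: XOR with 0x80 flips the MSB,
// mapping 0..255 onto -128..127 and back, exactly as the vectorised loop above
// does with wrapper::veor.
inline int8_t u8_to_s8(uint8_t v)
{
    return static_cast<int8_t>(v ^ 0x80); // 0 -> -128, 128 -> 0, 255 -> 127
}

inline uint8_t s8_to_u8(int8_t v)
{
    return static_cast<uint8_t>(static_cast<uint8_t>(v) ^ 0x80); // -128 -> 0, 0 -> 128, 127 -> 255
}
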
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
new file mode 100644
index 0000000000..e94d3d5ef2
--- /dev/null
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H
+#define ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to convert asymmetric unsigned to asymmetric signed and vice-versa */
+class CpuConvertQuantizedSignednessKernel : public ICpuKernel<CpuConvertQuantizedSignednessKernel>
+{
+public:
+ CpuConvertQuantizedSignednessKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertQuantizedSignednessKernel);
+ /** Initialize the kernel input and output info.
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data types supported: opposite of @p src.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuConvertQuantizedSignednessKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H */
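
A hedged usage sketch for this header (tensor names and shapes are illustrative, and it assumes the public TensorInfo/QuantizationInfo constructors): the kernel is configured on tensor infos only, and an empty destination info is auto-initialised to the opposite signedness with the corrected offset.

// Illustrative configuration sketch only.
TensorInfo src_info(TensorShape(32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
TensorInfo dst_info{}; // left empty: configure() makes it QASYMM8_SIGNED with offset 10 - 128

CpuConvertQuantizedSignednessKernel signedness_kernel;
if (bool(CpuConvertQuantizedSignednessKernel::validate(&src_info, &dst_info)))
{
    signedness_kernel.configure(&src_info, &dst_info);
}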
diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp
new file mode 100644
index 0000000000..1b693d7a3a
--- /dev/null
+++ b/src/cpu/kernels/CpuCopyKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuCopyKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList())
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
+
+ // Validate destination if initialized
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
+{
+    // Destination auto-initialization if not yet initialized
+ auto_init_if_empty(*dst, *src);
+ return std::make_pair(Status{}, calculate_max_window(*dst));
+}
+
+std::pair<Status, Window>
+validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
+{
+ const TensorShape src_shape = src->tensor_shape();
+ const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding);
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(padded_shape));
+ // Configure window
+ const Window win = calculate_max_window(*dst, dst->dimension(0));
+ return std::make_pair(Status{}, win);
+}
+
+} // namespace
+
+void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, padding));
+
+ _padding = padding;
+
+ std::pair<Status, Window> win_config;
+ if (padding.empty())
+ {
+ win_config = validate_and_configure_window(src, dst);
+ }
+ else
+ {
+ win_config = validate_and_configure_window_with_padding(src, dst, padding);
+ }
+
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICpuKernel::configure(win_config.second);
+}
+
+Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src,
+ const arm_compute::ITensorInfo *dst,
+ const PaddingList &padding)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding));
+
+ if (padding.empty())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
+ }
+
+ return Status{};
+}
+
+void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ if (_padding.empty())
+ {
+ Window dst_window{window};
+ dst_window.set(Window::DimX,
+ Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
+ Window out_slice = dst_window.first_slice_window_1D();
+ do
+ {
+ Iterator src_it(src, out_slice);
+ Iterator dst_it(dst, out_slice);
+
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &)
+ { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); },
+ src_it, dst_it);
+ } while (dst_window.slide_window_slice_1D(out_slice));
+ }
+ else
+ {
+ Window src_window{window};
+ src_window.set(Window::DimX,
+ Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
+
+ Iterator src_it(src, src_window);
+ Iterator dst_it(dst, window);
+ const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size();
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
+ std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
+ },
+ src_it, dst_it);
+ }
+}
+
+const char *CpuCopyKernel::name() const
+{
+ return "CpuCopyKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
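
In the padded path only the x padding moves the destination write pointer; padding on higher dimensions is absorbed by the larger destination shape the window iterates over. A plain sketch of the per-row copy, with hypothetical raw buffers standing in for the iterators:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy one row of `row_elems` elements of `elem_size` bytes each into a
// destination row that reserves `pad_left` elements before the payload.
void copy_row_with_left_pad(const uint8_t *src_row, uint8_t *dst_row,
                            std::size_t row_elems, std::size_t elem_size, std::size_t pad_left)
{
    std::memcpy(dst_row + pad_left * elem_size, src_row, row_elems * elem_size);
}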
diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h
new file mode 100644
index 0000000000..a05053f07e
--- /dev/null
+++ b/src/cpu/kernels/CpuCopyKernel.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_COPY_KERNEL_H
+#define ARM_COMPUTE_CPU_COPY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to perform a copy between two tensors */
+class CpuCopyKernel : public ICpuKernel<CpuCopyKernel>
+{
+public:
+ CpuCopyKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCopyKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor. Data types supported: All
+ * @param[out] dst Destination tensor. Data types supported: same as @p src.
+ * @param[in] padding (Optional) Padding to be applied to the input tensor
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding = PaddingList());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuCopyKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ PaddingList _padding{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_COPY_KERNEL_H */
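
A hedged configuration sketch: PaddingList holds one (before, after) pair per dimension, so padding dimension 0 by one element on each side grows the destination width by two (the shapes below are illustrative).

// Illustrative configuration sketch only.
TensorInfo src_info(TensorShape(8U, 4U), 1, DataType::F32);
TensorInfo dst_info{}; // auto-initialised by configure() to TensorShape(10U, 4U)

const PaddingList padding = {{1, 1}}; // dimension 0: one element before, one after
CpuCopyKernel copy_kernel;
if (bool(CpuCopyKernel::validate(&src_info, &dst_info, padding)))
{
    copy_kernel.configure(&src_info, &dst_info, padding);
}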
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
new file mode 100644
index 0000000000..82e3a5ce00
--- /dev/null
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/cpu/kernels/depthwiseconv2d/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels = {
+ {"neon_qu8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)},
+ {"neon_qs8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)},
+ {"neon_fp16_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::F16 && data.isa.fp16); },
+ REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)},
+ {"neon_fp32_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)},
+ {"neon_qp8_qu8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)},
+ {"neon_qp8_qs8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)},
+};
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) >
+ src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) >
+ src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) ||
+ (info.pad_stride_info.stride().second < 1));
+
+ if (is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ }
+
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
+
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ }
+ }
+
+ if (dst->total_size() != 0)
+ {
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, info));
+
+ _has_biases = (biases != nullptr);
+ _conv_info = info;
+
+ const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation(
+ DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+ _func = uk->ukernel;
+
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ auto_init_if_empty(*dst, src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(output_shape)
+ .set_quantization_info(dst->quantization_info()));
+
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
+ return Status{};
+}
+
+void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ _func(src, weights, biases, dst, window, _has_biases, _conv_info);
+}
+
+const char *CpuDepthwiseConv2dNativeKernel::name() const
+{
+ return "CpuDepthwiseConv2dNativeKernel";
+}
+
+const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &
+CpuDepthwiseConv2dNativeKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
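
The shape checks above follow the usual depthwise arithmetic: each spatial output dimension is (in + pad_before + pad_after - dilated_kernel) / stride + 1 and the channel count is multiplied by depth_multiplier. A simplified sketch of that calculation (the library's compute_depthwise_convolution_shape also handles rounding policy and asymmetric padding):

// Simplified sketch with symmetric padding and a single stride/dilation value.
struct DwOutShape
{
    unsigned int channels, width, height;
};

DwOutShape depthwise_out_shape(unsigned int in_c, unsigned int in_w, unsigned int in_h,
                               unsigned int k_w, unsigned int k_h,
                               unsigned int stride, unsigned int pad, unsigned int dilation,
                               unsigned int depth_multiplier)
{
    const unsigned int dilated_w = (k_w - 1) * dilation + 1;
    const unsigned int dilated_h = (k_h - 1) * dilation + 1;
    return {in_c * depth_multiplier,
            (in_w + 2 * pad - dilated_w) / stride + 1,
            (in_h + 2 * pad - dilated_h) / stride + 1};
}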
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
new file mode 100644
index 0000000000..7e78f52e13
--- /dev/null
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H
+
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+#include "support/AclRequires.h"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include <arm_neon.h>
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to run a depthwise convolution native on a tensor. */
+class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel>
+{
+private:
+ using DepthwiseConv2dNativeKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::
+ type;
+
+public:
+ CpuDepthwiseConv2dNativeKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
+
+ /** Initialize the function's source, destination and parameters.
+ *
+ * @note Supported data layouts: NHWC
+ *
+ * @param[in] src Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor. Data type supported: Same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dNativeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+ struct DepthwiseConv2dNativeKernel
+ {
+ const char *name;
+ const DepthwiseConv2dNativeDataTypeISASelectorPtr is_selected;
+ DepthwiseConv2dNativeKernelPtr ukernel;
+ };
+ static const std::vector<DepthwiseConv2dNativeKernel> &get_available_kernels();
+
+private:
+ /** Common signature for all the specialised depthwise convolution native functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ DepthwiseConv2dNativeKernelPtr _func{nullptr};
+ ConvolutionInfo _conv_info{};
+ bool _has_biases{false};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H */
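
get_implementation() in the .cpp above resolves to the first table entry whose selector accepts the (weights type, source type, ISA) triple. A generic sketch of that lookup pattern, with hypothetical stand-ins for the library's selector structs:

#include <functional>
#include <vector>

// Hypothetical stand-ins for the selector data and micro-kernel table.
struct SelectorData
{
    int  weights_dt;
    int  source_dt;
    bool fp16;
};
struct KernelEntry
{
    const char                               *name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)();
};

const KernelEntry *pick_kernel(const std::vector<KernelEntry> &table, const SelectorData &data)
{
    for (const auto &entry : table)
    {
        if (entry.is_selected(data) && entry.ukernel != nullptr)
        {
            return &entry; // first match wins, mirroring the registration order
        }
    }
    return nullptr;
}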
diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp
new file mode 100644
index 0000000000..5595ace998
--- /dev/null
+++ b/src/cpu/kernels/CpuDequantizeKernel.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuDequantizeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/dequantize/generic/neon/list.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+ DataType::QSYMM16);
+
+ if (dst->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
+
+ ICpuKernel::configure(win);
+
+ switch (dst->data_type())
+ {
+ case DataType::F32:
+ _func = REGISTER_FP32_NEON(fp32_run_dequantization_core);
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ _func = REGISTER_FP16_NEON(fp16_run_dequantization_core);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
+
+Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+ return Status{};
+}
+
+void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ _func(src, dst, window);
+}
+const char *CpuDequantizeKernel::name() const
+{
+ return "CpuDequantizeKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
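
Whichever micro-kernel is selected, the per-element arithmetic is the standard affine dequantisation f = scale * (q - offset), with the offset fixed at zero for the symmetric types and a per-channel scale for QSYMM8_PER_CHANNEL. A scalar sketch of the reference math:

#include <cstddef>
#include <cstdint>

// Scalar reference for the affine dequantisation the NEON kernels vectorise.
float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

float dequantize_qsymm8_per_channel(int8_t q, const float *per_channel_scale, std::size_t channel)
{
    return per_channel_scale[channel] * static_cast<float>(q); // symmetric: no offset
}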
diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h
new file mode 100644
index 0000000000..d8b6444f0a
--- /dev/null
+++ b/src/cpu/kernels/CpuDequantizeKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the dequantization layer kernel. */
+class CpuDequantizeKernel : public ICpuKernel<CpuDequantizeKernel>
+{
+public:
+ CpuDequantizeKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel);
+ /** Set input, output tensors.
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+ * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuDequantizeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Common signature for all the specialised @ref CpuDequantizeKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using DequantizeFunctionExecutorPtr = void (*)(const ITensor *input, ITensor *output, const Window &window);
+ DequantizeFunctionExecutorPtr _func{nullptr};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H
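
A hedged configuration sketch for the per-channel case (shapes and scales are illustrative): the per-channel scales travel in the source QuantizationInfo and an empty destination info is defaulted to F32 by configure().

// Illustrative configuration sketch only.
const std::vector<float> scales{0.1f, 0.2f, 0.05f};
TensorInfo src_info(TensorShape(3U, 4U), 1, DataType::QSYMM8_PER_CHANNEL, QuantizationInfo(scales));
TensorInfo dst_info{}; // defaulted to F32 by configure()

CpuDequantizeKernel dequantize_kernel;
if (bool(CpuDequantizeKernel::validate(&src_info, &dst_info)))
{
    dequantize_kernel.configure(&src_info, &dst_info);
}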
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
new file mode 100644
index 0000000000..4cb0fb1c40
--- /dev/null
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/directconv2d/list.h"
+
+using namespace arm_compute::detail;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels = {
+ {"neon_fp32_nhwc_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)},
+ {"neon_fp32_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)},
+ {"neon_fp16_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)},
+};
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+
+ const DataLayout data_layout = src->data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
+ ARM_COMPUTE_UNUSED(width_idx);
+ // Checks performed when output is configured
+ if (dst->total_size() != 0)
+ {
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+ DataType data_type = src->data_type();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_UNUSED(src);
+
+ Window win{};
+ bool window_changed = false;
+
+ // Configure window without any padding
+ win = calculate_max_window(*dst, Steps());
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+void CpuDirectConv2dKernel::configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+ _conv_info = conv_info;
+ _data_layout = src->data_layout();
+ _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
+
+ // Get convolved dimensions
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+ DataType data_type = src->data_type();
+
+    // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*dst, output_shape, 1, data_type);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICpuKernel::configure(win_config.second);
+}
+
+Status CpuDirectConv2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
+
+ return Status{};
+}
+
+void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const auto *uk = CpuDirectConv2dKernel::get_implementation(
+ DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+ uk->ukernel(window, src, weights, dst, _conv_info);
+}
+const char *CpuDirectConv2dKernel::name() const
+{
+ return "CpuDirectConvolutionLayerKernel";
+}
+
+const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> &CpuDirectConv2dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
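
Each selected ukernel ultimately computes, per output element, a dot product between the receptive field and the corresponding filter. A scalar reference for a single F32 output element (a simplified sketch that assumes a flat row-major NHWC layout and square kernels; bias and requantisation live in the separate output-stage kernel):

// Computes one output element dst(oy, ox, ofm) for flat row-major NHWC buffers.
float direct_conv2d_element(const float *src, const float *weights,
                            int ifm, int src_w, int src_h,
                            int k_size, int stride, int pad,
                            int ox, int oy, int ofm)
{
    float acc = 0.f;
    for (int ky = 0; ky < k_size; ++ky)
    {
        for (int kx = 0; kx < k_size; ++kx)
        {
            const int iy = oy * stride + ky - pad;
            const int ix = ox * stride + kx - pad;
            if (ix < 0 || iy < 0 || ix >= src_w || iy >= src_h)
            {
                continue; // zero padding contributes nothing
            }
            for (int c = 0; c < ifm; ++c)
            {
                acc += src[(iy * src_w + ix) * ifm + c] *
                       weights[((ofm * k_size + ky) * k_size + kx) * ifm + c];
            }
        }
    }
    return acc;
}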
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
new file mode 100644
index 0000000000..ad4caea193
--- /dev/null
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform Direct Convolution Layer. */
+class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
+{
+private:
+ using DirectConv2dKernel_Ptr = std::add_pointer<void(
+ const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
+
+public:
+ CpuDirectConv2dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
+ /** Set the src, weights, and dst tensors.
+ *
+ * @note: DirectConvolution only works in the following configurations:
+ * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+ * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+ *
+ * @param[in]  src       The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+ *                       while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
+ * @param[in]  weights   Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
+ *                       Data type supported: Same as @p src.
+ * @param[out] dst       Output tensor.
+ *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDirectConv2dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct DirectConv2dKernel
+ {
+ const char *name;
+ const DataTypeDataLayoutSelectorPtr is_selected;
+ DirectConv2dKernel_Ptr ukernel;
+ };
+
+ static const std::vector<DirectConv2dKernel> &get_available_kernels();
+
+private:
+ PadStrideInfo _conv_info{};
+ unsigned int _kernel_size{0};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H */
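
A hedged configuration sketch for the F32 NHWC path (shapes are illustrative; it assumes the public TensorInfo and PadStrideInfo constructors): the channel dimension of the weights must match the source and the kernel must be square.

// Illustrative configuration sketch only: 3x3 convolution, stride 1, no padding.
TensorInfo src_info(TensorShape(8U, 16U, 16U), 1, DataType::F32);       // [IFM, W, H]
TensorInfo weights_info(TensorShape(8U, 3U, 3U, 4U), 1, DataType::F32); // [IFM, kx, ky, OFM]
TensorInfo dst_info{};
src_info.set_data_layout(DataLayout::NHWC);
weights_info.set_data_layout(DataLayout::NHWC);

const PadStrideInfo conv_info(1, 1, 0, 0); // stride_x, stride_y, pad_x, pad_y
CpuDirectConv2dKernel conv_kernel;
if (bool(CpuDirectConv2dKernel::validate(&src_info, &weights_info, &dst_info, conv_info)))
{
    conv_kernel.configure(&src_info, &weights_info, &dst_info, conv_info);
}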
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
new file mode 100644
index 0000000000..d4af8bedaf
--- /dev/null
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(
+ src->data_layout(), DataLayoutDimension::CHANNEL)));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ }
+
+ if (src->data_type() == DataType::S32)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
+ }
+
+ // Checks performed when output is configured
+ if ((dst != nullptr) && (dst->total_size() != 0))
+ {
+ if (is_data_type_float(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+ else if (src->data_type() == DataType::S32)
+ {
+ // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
+ ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) &&
+ (info.output_data_type != DataType::QASYMM8_SIGNED));
+ }
+
+ return Status{};
+}
+
+template <typename T>
+typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
+output_stage_nchw(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator out(dst, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
+ auto v_in = wrapper::vloadq(in_ptr);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
+ v_in = wrapper::vadd(v_in, vb);
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, v_in);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
+
+ *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
+output_stage_nhwc(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator bi(bias, window_bias);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
+ auto v_in = wrapper::vloadq(in_ptr + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ wrapper::vstore(out_ptr + x, v_in);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ *(out_ptr + x) = s_in;
+ }
+ },
+ in, bi, out);
+}
+
+// Quantized case
+template <
+ typename TOut,
+ typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nchw(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
+ using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+
+ const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
+ const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12)}};
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
+ v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb),
+ wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}};
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+ }
+ },
+ in, out);
+}
+template <
+ typename TOut,
+ typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nhwc(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
+ using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+
+ const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
+ const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
+
+ Window window_bias = window;
+ window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator bi(bias, window_bias);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{
+ wrapper::vloadq(in_ptr),
+ wrapper::vloadq(in_ptr + 4),
+ wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12),
+ }};
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+
+                    v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
+                    v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
+                    v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
+                    v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32_t s_in = *in_ptr;
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+ }
+ },
+ in, bi, out);
+}
+} // namespace
+
+void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(bias);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
+
+ _func = nullptr;
+ _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
+ _result_shift = info.result_shift;
+ _result_offset_after_shift = info.result_offset_after_shift;
+
+    // Auto-initialize the output if required
+ if (dst != nullptr)
+ {
+ // Work out expected output data type
+ const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt));
+ }
+
+ Window win = calculate_max_window(*src, Steps());
+
+ ICpuKernel::configure(win);
+
+ const bool is_qasymm8_signed =
+ (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
+
+ // Set appropriate function
+ if (src->data_layout() == DataLayout::NCHW)
+ {
+ switch (src->data_type())
+ {
+ case DataType::S32:
+ {
+ if (is_qasymm8_signed)
+ {
+ _func = &output_stage_nchw<int8_t>;
+ }
+ else
+ {
+ _func = &output_stage_nchw<uint8_t>;
+ }
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ _func = &output_stage_nchw<float16_t>;
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
+ {
+ _func = &output_stage_nchw<float>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
+ }
+ }
+ else
+ {
+ switch (src->data_type())
+ {
+ case DataType::S32:
+ {
+ if (is_qasymm8_signed)
+ {
+ _func = &output_stage_nhwc<int8_t>;
+ }
+ else
+ {
+ _func = &output_stage_nhwc<uint8_t>;
+ }
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ _func = &output_stage_nhwc<float16_t>;
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
+ {
+ _func = &output_stage_nhwc<float>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
+ }
+ }
+}
+
+Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
+ return Status{};
+}
+
+void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
+}
+
+const char *CpuDirectConv2dOutputStageKernel::name() const
+{
+ return "CpuDirectConv2dOutputStageKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
new file mode 100644
index 0000000000..ce84f49cf6
--- /dev/null
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input.
+ *
+ * @note We assume bias to be shared
+ * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
+ * of the @ref DirectConvolutionLayerOutputStageKernelInfo.
+ */
+class CpuDirectConv2dOutputStageKernel : public ICpuKernel<CpuDirectConv2dOutputStageKernel>
+{
+public:
+ CpuDirectConv2dOutputStageKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel);
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place.
+ * Data type supported: F16/F32/S32
+ * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
+ * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+ * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
+ * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
+ * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+ */
+ void
+ configure(ITensorInfo *src,
+ const ITensorInfo *bias = nullptr,
+ ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDirectConv2dOutputStageKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src,
+ const ITensorInfo *bias = nullptr,
+ const ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
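+
+    /* Usage sketch (illustrative only; the tensor-info and tensor names are hypothetical):
+     *
+     *   // Downscale an S32 accumulator to QASYMM8 with a shared bias:
+     *   DirectConvolutionLayerOutputStageKernelInfo info{};
+     *   info.result_fixedpoint_multiplier = multiplier; // derived from the quantization scales
+     *   info.result_shift                 = shift;
+     *   info.result_offset_after_shift    = offset;
+     *   info.output_data_type             = DataType::QASYMM8;
+     *
+     *   CpuDirectConv2dOutputStageKernel k;
+     *   k.configure(&acc_s32_info, &bias_s32_info, &dst_q8_info, info);
+     *
+     *   // At run time the tensors are bound through an ITensorPack:
+     *   ITensorPack pack;
+     *   pack.add_tensor(TensorType::ACL_SRC_0, &acc);
+     *   pack.add_const_tensor(TensorType::ACL_SRC_1, &bias);
+     *   pack.add_tensor(TensorType::ACL_DST, &dst);
+     */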
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ using OutputStageKernel = void(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift);
+
+ OutputStageKernel *_func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H */
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
new file mode 100644
index 0000000000..b5b2aed1ba
--- /dev/null
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuDirectConv3dKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/conv3d/neon/list.h"
+
+#include <algorithm>
+
+using namespace arm_compute::detail;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = {
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ {"neon_fp16_directconv3d",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)},
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+ {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)},
+ {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)},
+ {"neon_qasymm8_signed_directconv3d",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}};
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U));
+
+ const auto *uk =
+ CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
+
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ const DataLayout data_layout = src0->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ // Weight layout is D, H, W, Cin, Cout
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx));
+
+ if (src2 != nullptr)
+ {
+ if (is_data_type_quantized(src0->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // Checks performed when output is configured
+ if (dst->total_size() != 0)
+ {
+ TensorShape output_shape =
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+
+ DataType data_type = src0->data_type();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuDirectConv3dKernel::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
+{
+ ARM_COMPUTE_UNUSED(src2);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ const auto *uk =
+ CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _conv_info = conv_info;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuDirectConv3dKernel").append("/").append(uk->name);
+
+ // Get convolved dimensions
+ TensorShape output_shape =
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+
+ DataType data_type = src0->data_type();
+
+    // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*dst, output_shape, 1, data_type);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, conv_info));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info));
+
+ return Status{};
+}
+
+void CpuDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _run_method(src0, src1, src2, dst, _conv_info, window);
+}
+
+const char *CpuDirectConv3dKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> &CpuDirectConv3dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h
new file mode 100644
index 0000000000..8e6f564679
--- /dev/null
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H
+
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform 3D Direct Convolution Layer. */
+class CpuDirectConv3dKernel : public ICpuKernel<CpuDirectConv3dKernel>
+{
+private:
+    /* Function pointer type for the 3D direct convolution (NDHWC) micro-kernel */
+ using DirectConv3dKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
+
+public:
+ CpuDirectConv3dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv3dKernel);
+ /** Set the src, weights, biases and dst tensor info.
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in, out] src0 Input tensor info.
+ * @param[in] src1 Set of kernels to convolve the input volume.
+ * The 2nd dimension must be the same as the input's volume 1st dimension.
+ * @param[in] src2 Set of biases. Can be nullptr.
+ * @param[out] dst Output tensor info.
+     *                           The 1st dimension must be equal to the 1st dimension of the @p src1 tensor.
+     * @param[in]      conv_info Contains padding, stride and activation information.
+ *
+ */
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDirectConv3dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info);
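+
+    /* Usage sketch (illustrative only; the tensor-info and tensor names are hypothetical):
+     *
+     *   Conv3dInfo conv_info{}; // stride / padding / activation as required
+     *   CpuDirectConv3dKernel k;
+     *   k.configure(&src_info, &weights_info, &bias_info, &dst_info, conv_info);
+     *
+     *   ITensorPack pack;
+     *   pack.add_const_tensor(TensorType::ACL_SRC_0, &src);     // input volume
+     *   pack.add_const_tensor(TensorType::ACL_SRC_1, &weights); // kernels
+     *   pack.add_const_tensor(TensorType::ACL_SRC_2, &bias);    // optional
+     *   pack.add_tensor(TensorType::ACL_DST, &dst);
+     */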
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct DirectConv3dKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ DirectConv3dKernelPtr ukernel;
+ };
+
+ static const std::vector<DirectConv3dKernel> &get_available_kernels();
+
+private:
+ Conv3dInfo _conv_info{};
+ DirectConv3dKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H */
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
new file mode 100644
index 0000000000..57a3f39822
--- /dev/null
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuElementwiseKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/elementwise_binary/list.h"
+
+#include <arm_neon.h>
+
+#if defined(ENABLE_FP32_KERNELS)
+namespace
+{
+static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
+static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
+} // namespace
+#endif /* ENABLE_FP32_KERNELS */
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+template <ArithmeticOperation op>
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic = {
+ {"sve2_qu8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)},
+ {"sve2_qs8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)},
+ {"sve_fp32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)},
+ {"sve_s32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)},
+ {"sve_s16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)},
+ {"sve_fp16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ static_cast<ArithmeticOperation>(data.op) == op;
+ },
+ REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)},
+ {"neon_fp32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)},
+ {"neon_s32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)},
+ {"neon_fp16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)},
+ {"neon_s16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)},
+ {"neon_qu8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)},
+ {"neon_qs8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)},
+};
+template <ComparisonOperation op>
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comparison = {
+ {"sve2_qu8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)},
+ {"sve2_qs8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)},
+ {"sve_u8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)},
+ {"sve_fp32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)},
+ {"sve_s16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)},
+ {"sve_s32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)},
+ {"sve_fp16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ static_cast<ComparisonOperation>(data.op) == op;
+ },
+ REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)},
+ {"neon_u8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)},
+ {"neon_fp32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)},
+ {"neon_s16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)},
+ {"neon_s32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)},
+ {"neon_qu8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)},
+ {"neon_qs8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)},
+ {"neon_fp16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)},
+};
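+
+// Note: get_implementation() picks the first entry whose predicate matches the data type, ISA and
+// requested operation, so the SVE2/SVE specialisations listed first in both tables take precedence
+// over the NEON fallbacks.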
+} // namespace
+
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &
+CpuArithmeticKernel::get_available_kernels()
+{
+ static std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels;
+ std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(),
+ std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels));
+
+ return available_kernels;
+}
+
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &
+CpuComparisonKernel::get_available_kernels()
+{
+ static std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels;
+    std::move(available_kernels_comparison<ComparisonOperation::Equal>.begin(),
+              available_kernels_comparison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comparison<ComparisonOperation::NotEqual>.begin(),
+              available_kernels_comparison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comparison<ComparisonOperation::Greater>.begin(),
+              available_kernels_comparison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comparison<ComparisonOperation::GreaterEqual>.begin(),
+              available_kernels_comparison<ComparisonOperation::GreaterEqual>.end(),
+              std::back_inserter(available_kernels));
+    std::move(available_kernels_comparison<ComparisonOperation::Less>.begin(),
+              available_kernels_comparison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comparison<ComparisonOperation::LessEqual>.begin(),
+              available_kernels_comparison<ComparisonOperation::LessEqual>.end(),
+              std::back_inserter(available_kernels));
+
+ return available_kernels;
+}
+
+template <class Derived>
+Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0,
+ const ITensorInfo &src1,
+ const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ const auto *uk = CpuArithmeticKernel::get_implementation(
+ ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
+
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuArithmeticKernel").append("/").append(uk->name);
+
+    // If any of the shapes is dynamic, expect a configured window and dst at run-time.
+ if (src0->is_dynamic() || src1->is_dynamic())
+ {
+ return;
+ }
+
+ auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape());
+ auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type());
+ ICpuKernel::configure(shape_and_window.second);
+}
+
+void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ const auto *uk = CpuComparisonKernel::get_implementation(
+ ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
+
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuComparisonKernel").append("/").append(uk->name);
+
+    // If any of the shapes is dynamic, expect a configured window and dst at run-time.
+ if (src0->is_dynamic() || src1->is_dynamic())
+ {
+ return;
+ }
+
+ auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape());
+ auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type());
+ ICpuKernel::configure(shape_and_window.second);
+}
+
+template <class Derived>
+void CpuElementwiseKernel<Derived>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _run_method(src0, src1, dst, window);
+}
+template void
+CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+template void
+CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+
+template <class Derived>
+const char *CpuElementwiseKernel<Derived>::name() const
+{
+ return _name.c_str();
+}
+template const char *CpuElementwiseKernel<CpuArithmeticKernel>::name() const;
+template const char *CpuElementwiseKernel<CpuComparisonKernel>::name() const;
+
+/** Arithmetic operators (add, sub, div, min, max, squared_diff, power, prelu) */
+void CpuArithmeticKernel::configure(ArithmeticOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+ _op = op;
+ CpuArithmeticKernel::configure_common(src0, src1, dst);
+}
+
+Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
+ }
+ return validate_arguments_common(src0, src1, dst);
+}
+
+Status CpuArithmeticKernel::validate(ArithmeticOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
+ return Status{};
+}
+
+size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> ||
+ this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if (platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_min_max_mws_N1_fp32_neon;
+ }
+ else if (platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_min_max_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if (this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
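+
+// Worked example (illustrative): for FP32 MIN/MAX on a Neoverse N1 with a window of 64 iterations in
+// x and 512 in y, the scaling above gives 25308 / 64 ~= 395, i.e. a minimum workload of roughly 395
+// y-iterations, which still lets the scheduler split the y dimension across threads.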
+
+/** The division operator */
+
+void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+ _op = ArithmeticOperation::DIV;
+ CpuArithmeticKernel::configure_common(src0, src1, dst);
+}
+
+size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if (platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_div_mws_N1_fp32_neon;
+ }
+ else if (platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_div_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if (this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
+Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
+ return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
+}
+
+Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
+ return Status{};
+}
+
+/** The power operator */
+void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+ _op = ArithmeticOperation::POWER;
+ CpuArithmeticKernel::configure_common(src0, src1, dst);
+}
+
+Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32);
+ return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
+}
+
+Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
+ return Status{};
+}
+
+/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
+void CpuComparisonKernel::configure(ComparisonOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+ _op = op;
+ CpuComparisonKernel::configure_common(src0, src1, dst);
+}
+
+Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16,
+ DataType::S32, DataType::F32);
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8);
+ }
+ return validate_arguments_common(src0, src1, dst);
+}
+
+Status CpuComparisonKernel::validate(ComparisonOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
+ return Status{};
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
new file mode 100644
index 0000000000..1f3e613b80
--- /dev/null
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H
+#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for an element-wise operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f]
+ *
+ */
+template <class Derived>
+class CpuElementwiseKernel : public ICpuKernel<Derived>
+{
+private:
+ using ElementwiseKernelPtr =
+ std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+
+public:
+ CpuElementwiseKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel);
+
+ using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &);
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+ const char *name() const override;
+
+ struct ElementwiseKernel
+ {
+ const char *name;
+ const ElementwiseDataTypeISASelectorPtr is_selected;
+ ElementwiseKernelPtr ukernel;
+ };
+
+protected:
+    /** Validate the arguments passed to the kernel
+ *
+     * @param[in] src0 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in] src1 Second tensor input. Data types supported: Same as @p src0.
+ * @param[in] dst Output tensor. Data types supported: Dependent on subclass.
+ */
+ static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
+
+protected:
+ ElementwiseKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+
+class CpuArithmeticKernel : public CpuElementwiseKernel<CpuArithmeticKernel>
+{
+public:
+ CpuArithmeticKernel() = default;
+
+ /** Configure kernel
+ *
+ * @param[in] op Arithmetic operation to be executed.
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
+ */
+ void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
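+
+    /* Usage sketch (illustrative only; the tensor-info and tensor names are hypothetical):
+     *
+     *   CpuArithmeticKernel k;
+     *   k.configure(ArithmeticOperation::ADD, &a_info, &b_info, &out_info);
+     *
+     *   ITensorPack pack;
+     *   pack.add_const_tensor(TensorType::ACL_SRC_0, &a);
+     *   pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
+     *   pack.add_tensor(TensorType::ACL_DST, &out);
+     */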
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuArithmeticKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+ static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels();
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+     * @return Minimum workload size for the requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+protected:
+    /** Common configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
+ */
+ void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ // Inherited methods overridden:
+ static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
+
+ ArithmeticOperation _op{};
+};
+
+class CpuDivisionKernel : public CpuArithmeticKernel
+{
+public:
+ CpuDivisionKernel() = default;
+
+ /** Configure kernel
+ *
+ * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
+ * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDivisionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+     * @return Minimum workload size for the requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+protected:
+ // Inherited methods overridden:
+ static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
+};
+
+class CpuPowerKernel : public CpuArithmeticKernel
+{
+public:
+ CpuPowerKernel() = default;
+
+ /** Configure kernel
+ *
+ * @param[in] src0 First tensor input info. Data types supported: F16/F32.
+ * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPowerKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+protected:
+ // Inherited methods overridden:
+ static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
+};
+
+class CpuComparisonKernel : public CpuElementwiseKernel<CpuComparisonKernel>
+{
+public:
+ CpuComparisonKernel() = default;
+
+ /** Configure kernel
+ *
+ * @param[in] op Comparison operation to be executed.
+ * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: U8.
+ */
+ void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
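+
+    /* Usage sketch (illustrative only; the tensor-info names are hypothetical):
+     *
+     *   // Element-wise "greater than": the destination is a U8 mask.
+     *   CpuComparisonKernel k;
+     *   k.configure(ComparisonOperation::Greater, &a_info, &b_info, &mask_info);
+     */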
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuComparisonKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+ static const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &get_available_kernels();
+
+protected:
+    /** Common configure function for element-wise comparison operators with no additional options (e.g. Equal, Greater, Less)
+ */
+ void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ // Inherited methods overridden:
+ static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
+
+private:
+ ComparisonOperation _op{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
new file mode 100644
index 0000000000..88545ee756
--- /dev/null
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/elementwise_unary/list.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+#ifdef __aarch64__
+
+std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON(src->data_type() != dst->data_type());
+ ARM_COMPUTE_ERROR_ON(!is_data_type_quantized(src->data_type()));
+ ARM_COMPUTE_ERROR_ON(src->element_size() != 1);
+
+ auto lut = std::unique_ptr<uint8_t[]>(new uint8_t[256]);
+ const auto is_signed = src->data_type() == DataType::QASYMM8_SIGNED;
+ const auto src_qi = src->quantization_info().uniform();
+ const auto dst_qi = dst->quantization_info().uniform();
+
+ const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale;
+ const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale;
+
+ for (int i = 0; i < 256; ++i)
+ {
+ const auto in =
+ (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
+ float result = 0;
+
+ switch (op)
+ {
+ case ElementWiseUnary::RSQRT:
+ result = 1 / sqrt(in);
+ break;
+
+ case ElementWiseUnary::EXP:
+ result = std::exp(in);
+ break;
+
+ case ElementWiseUnary::NEG:
+ result = -in;
+ break;
+
+ case ElementWiseUnary::LOG:
+ result = std::log(in);
+ break;
+
+ case ElementWiseUnary::ABS:
+ result = std::abs(in);
+ break;
+
+ case ElementWiseUnary::ROUND:
+ result = support::cpp11::nearbyint(in);
+ break;
+
+ case ElementWiseUnary::SIN:
+ result = std::sin(in);
+ break;
+
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ result = utility::clamp(result, dst_min_fp, dst_max_fp);
+
+ const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi))
+ : quantize_qasymm8(result, dst_qi);
+ lut[i] = out;
+ }
+
+ return lut;
+}
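+
+// The table built above satisfies, for every quantized input value q:
+//   lut[q] == quantize(clamp(op(dequantize(q)), dst_min_fp, dst_max_fp))
+// which lets the quantized unary kernels evaluate the operation with a table lookup per element.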
+
+#endif // __aarch64__
+
+static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels = {
+ {
+ "sve_fp32_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); },
+ REGISTER_FP32_SVE(sve_fp32_elementwise_unary),
+ nullptr,
+ },
+ {
+ "sve_fp16_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); },
+ REGISTER_FP16_SVE(sve_fp16_elementwise_unary),
+ nullptr,
+ },
+ {
+ "sve_s32_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); },
+ REGISTER_INTEGER_SVE(sve_s32_elementwise_unary),
+ nullptr,
+ },
+ {
+ "neon_fp32_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(neon_fp32_elementwise_unary),
+ nullptr,
+ },
+ {
+ "neon_fp16_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_elementwise_unary),
+ nullptr,
+ },
+ {
+ "neon_s32_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(neon_s32_elementwise_unary),
+ nullptr,
+ },
+#ifdef __aarch64__
+ {
+ "sve2_q8_elementwise_unary",
+ [](const DataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary),
+ &q8_prepare_lut,
+ },
+ {
+ "neon_q8_elementwise_unary",
+ [](const DataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary),
+ &q8_prepare_lut,
+ },
+#else // __aarch64__
+ {
+ "neon_qasymm8_signed_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary),
+ nullptr,
+ },
+ {
+ "neon_qasymm8_elementwise_unary",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary),
+ nullptr,
+ },
+#endif // __aarch64__
+};
+
+} // namespace
+
+void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
+ const auto uk = CpuElementwiseUnaryKernel::get_implementation(
+ DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ _op = op;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name);
+
+    // If the input shape is dynamic, expect a configured window and dst at run-time.
+ if (src.is_dynamic())
+ {
+ return;
+ }
+
+ if (uk->prepare_func != nullptr)
+ {
+ _lut = uk->prepare_func(op, &src, &dst);
+ }
+
+ auto shape_and_window = compute_output_shape_and_window(src.tensor_shape());
+ auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type());
+ ICpuKernel::configure(shape_and_window.second);
+}
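+
+// A minimal usage sketch (illustrative only): src_info/dst_info are assumed to be pre-built ITensorInfo
+// objects and src/dst allocated ITensor objects. In practice the kernel is normally dispatched through the
+// CPU scheduler rather than run directly.
+//
+//   CpuElementwiseUnaryKernel k;
+//   k.configure(ElementWiseUnary::ABS, src_info, dst_info);
+//
+//   ITensorPack pack;
+//   pack.add_const_tensor(TensorType::ACL_SRC, &src);
+//   pack.add_tensor(TensorType::ACL_DST, &dst);
+//   k.run_op(pack, k.window(), ThreadInfo{});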
+
+Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
+
+ const auto *uk = CpuElementwiseUnaryKernel::get_implementation(
+ DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
+
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ switch (op)
+ {
+ case ElementWiseUnary::EXP:
+ case ElementWiseUnary::RSQRT:
+ case ElementWiseUnary::LOG:
+ case ElementWiseUnary::ROUND:
+ case ElementWiseUnary::SIN:
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ break;
+ case ElementWiseUnary::NEG:
+ case ElementWiseUnary::ABS:
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
+ }
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+ }
+
+ return Status{};
+}
+
+void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _run_method(src, dst, window, _op, _lut.get());
+}
+
+const char *CpuElementwiseUnaryKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> &CpuElementwiseUnaryKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
new file mode 100644
index 0000000000..249909854e
--- /dev/null
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
+#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for an element-wise unary operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ dst(x) = OP(src(x))@f]
+ */
+class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel>
+{
+private:
+ using ElementwiseUnaryUkernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type;
+ using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(
+ ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type;
+
+public:
+ CpuElementwiseUnaryKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel);
+
+ /** Function to configure the @ref CpuElementwiseUnaryKernel
+ *
+ * @param[in] op Element-wise unary operation to be executed.
+ * @param[in] src First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32; S32 is additionally supported for NEG/ABS operations.
+ * @param[out] dst Output tensor. Data types supported: Same as @p src.
+ */
+ void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuElementwiseUnaryKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct ElementwiseUnaryKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ ElementwiseUnaryUkernelPtr ukernel;
+ ElementwiseUnaryPreparePtr prepare_func;
+ };
+
+ static const std::vector<ElementwiseUnaryKernel> &get_available_kernels();
+
+private:
+ ElementWiseUnary _op{};
+ ElementwiseUnaryUkernelPtr _run_method{nullptr};
+ std::string _name{};
+ std::unique_ptr<uint8_t[]> _lut{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H */
diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp
new file mode 100644
index 0000000000..754da97ae1
--- /dev/null
+++ b/src/cpu/kernels/CpuFillKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuFillKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void CpuFillKernel::configure(const ITensorInfo *tensor, const PixelValue &constant_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ _constant_value = constant_value;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*tensor, Steps());
+ ICpuKernel::configure(win);
+}
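+
+// A minimal usage sketch (illustrative only): tensor is assumed to be an allocated F32 ITensor. In practice
+// the kernel is normally dispatched through the CPU scheduler rather than run directly.
+//
+//   CpuFillKernel k;
+//   k.configure(tensor.info(), PixelValue(0, DataType::F32));
+//
+//   ITensorPack pack;
+//   pack.add_tensor(TensorType::ACL_SRC_DST, &tensor);
+//   k.run_op(pack, k.window(), ThreadInfo{});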
+
+void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto inout = tensors.get_tensor(TensorType::ACL_SRC_DST);
+
+ // Collapse all the batches on the third dimension
+ bool has_collapsed = true;
+ Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ uint8_t *const start_valid_region = inout->ptr_to_element(inout->info()->valid_region().anchor);
+ const auto window_width = static_cast<int>(collapsed.x().end()) - static_cast<int>(collapsed.x().start());
+ const size_t element_size = inout->info()->element_size();
+
+ // Unroll X dimension
+ collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator tensor_it(inout, collapsed);
+ execute_window_loop(
+ collapsed,
+ [&](const Coordinates &)
+ {
+ uint8_t *base_addr = start_valid_region + tensor_it.offset();
+ // Set memory
+ for (int i = 0; i < window_width; ++i)
+ {
+ std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
+ }
+ },
+ tensor_it);
+}
+
+const char *CpuFillKernel::name() const
+{
+ return "CpuFillKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h
new file mode 100644
index 0000000000..7c200c9b59
--- /dev/null
+++ b/src/cpu/kernels/CpuFillKernel.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FILL_KERNEL_H
+#define ARM_COMPUTE_CPU_FILL_KERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel for filling a tensor with a given constant value */
+class CpuFillKernel : public ICpuKernel<CpuFillKernel>
+{
+public:
+ CpuFillKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFillKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in,out] tensor Tensor to fill. Supported data types: All
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ */
+ void configure(const ITensorInfo *tensor, const PixelValue &constant_value);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ PixelValue _constant_value{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FILL_KERNEL_H */
diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp
new file mode 100644
index 0000000000..df7e6aad46
--- /dev/null
+++ b/src/cpu/kernels/CpuFloorKernel.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuFloorKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/floor/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuFloorKernel::FloorKernel> available_kernels = {
+ {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)},
+ {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}};
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+ const auto *uk =
+ CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ // Validate in case of configured output
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+ auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
+
+ const auto *uk =
+ CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuFloorKernel").append("/").append(uk->name);
+
+ // Configure kernel window
+ const Window win = calculate_max_window(*src, Steps());
+
+ ICPPKernel::configure(win);
+}
+
+Window CpuFloorKernel::infer_window(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_UNUSED(dst);
+ ARM_COMPUTE_ERROR_ON(!bool(validate_arguments(src, dst)));
+
+ Window win;
+ win.use_tensor_dimensions(src->tensor_shape());
+ return win;
+}
+
+Status CpuFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ return Status{};
+}
+
+void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+ const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
+
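+ // Collapse the X dimension: each call to the selected ukernel below floors a full row of `len` contiguous elements.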
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator src_it(src, win);
+ Iterator dst_it(dst, win);
+
+ execute_window_loop(
+ win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it);
+}
+
+const char *CpuFloorKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuFloorKernel::FloorKernel> &CpuFloorKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h
new file mode 100644
index 0000000000..57107d0532
--- /dev/null
+++ b/src/cpu/kernels/CpuFloorKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FLOOR_KERNEL_H
+#define ARM_COMPUTE_CPU_FLOOR_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Cpu accelerated kernel to perform a floor operation */
+class CpuFloorKernel : public ICpuKernel<CpuFloorKernel>
+{
+private:
+ using FloorKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
+
+public:
+ CpuFloorKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor. Data type supported: F16/F32.
+ * @param[out] dst Destination tensor. Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuFloorKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+ /** Infer execution window
+ *
+ * @param[in] src Source tensor info. Data type supported: F16/F32.
+ * @param[in] dst Destination tensor info. Same as @p src
+ *
+ * @return an execution Window
+ */
+ Window infer_window(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct FloorKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ FloorKernelPtr ukernel;
+ };
+
+ static const std::vector<FloorKernel> &get_available_kernels();
+
+private:
+ FloorKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FLOOR_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
new file mode 100644
index 0000000000..db433c99a8
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+void CpuGemmInterleave4x4Kernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_interleaved_shape(*src)));
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmInterleave4x4Kernel::validate(src, dst));
+
+ Window win = calculate_max_window(*src, Steps(1, 4));
+ ICPPKernel::configure(win);
+}
+
+Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorShape dst_shape = compute_interleaved_shape(*src);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+
+void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+ /*
+ * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ */
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const size_t window_start_x = window.x().start();
+ const size_t window_end_x = window.x().end();
+
+ const size_t in_height = src->info()->dimension(1);
+ const size_t in_stride = src->info()->strides_in_bytes()[1];
+
+ const size_t partial_y = in_height % 4;
+
+ const size_t element_size = src->info()->element_size();
+
+ // Set window for the src tensor
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Set window for the dst tensor
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_out.scale(Window::DimY, 0.25f);
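+ // The source window advances 4 rows per iteration (Steps(1, 4) in configure()), so scaling the destination
+ // Y dimension by 0.25 makes the output advance by exactly one interleaved row per iteration.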
+
+ Iterator in(src, win);
+ Iterator out(dst, win_out);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ if (id.y() + 4 <= static_cast<int>(in_height))
+ {
+ for (size_t x = window_start_x; x < window_end_x; ++x)
+ {
+ std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size,
+ element_size);
+ }
+ }
+ else
+ {
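+ // Fewer than 4 source rows remain: copy the partial_y leftover rows and zero-pad the block up to 4 rows
+ // so the interleaved output keeps its rectangular layout.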
+ for (size_t x = window_start_x; x < window_end_x; ++x)
+ {
+ size_t y = 0;
+ for (; y < partial_y; ++y)
+ {
+ std::memcpy(out.ptr() + (x * 4 + y) * element_size,
+ (in.ptr() + y * in_stride) + x * element_size, element_size);
+ }
+ for (; y < 4; ++y)
+ {
+ std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+ }
+ }
+ }
+ },
+ in, out);
+}
+
+const char *CpuGemmInterleave4x4Kernel::name() const
+{
+ return "CpuGemmInterleave4x4Kernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
new file mode 100644
index 0000000000..2ce34bc4bc
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to interleave the elements of a matrix
+ *
+ * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ */
+class CpuGemmInterleave4x4Kernel : public ICpuKernel<CpuGemmInterleave4x4Kernel>
+{
+public:
+ CpuGemmInterleave4x4Kernel() = default;
+ /** Initialise the kernel's src and dst.
+ *
+ * @param[in] src Input tensor info. Data types supported: All
+ * @param[out] dst Output tensor info which stores the interleaved matrix. Data type supported: same as @p src.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmInterleave4x4Kernel
+ *
+ * Similar to @ref CpuGemmInterleave4x4Kernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..a3ed2cd171
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
@@ -0,0 +1,877 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+void inline vector_matrix_multiply_u8(Iterator &ina,
+ Iterator &inb,
+ Iterator &out,
+ int width_a,
+ int width_b,
+ int width_out,
+ size_t stride_b,
+ const Window &window)
+{
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ if (id.x() > width_b)
+ {
+ return;
+ }
+
+ // Note: Since the inputs are all positive, we can use uint32_t
+ // Accumulators for the block 0
+ uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
+
+ // This for loop performs 8 accumulations
+ for (; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const uint8x8_t a00_u8 = vld1_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
+ const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
+ const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
+ const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
+ const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
+ const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
+ const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
+ const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
+
+ // Convert a00_u8 to uint16_t (lower and higher halves)
+ const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}};
+
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+ const uint16x4x4_t b10_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}};
+
+ const uint16x4x4_t b20_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}};
+
+ const uint16x4x4_t b30_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}};
+
+ const uint16x4x4_t b40_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}};
+
+ const uint16x4x4_t b50_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}};
+
+ const uint16x4x4_t b60_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}};
+
+ const uint16x4x4_t b70_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}};
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
+
+ // This for loop performs the left-over accumulations
+ for (; vec_a < vec_a_end_addr;)
+ {
+ const uint8x8_t a00_u8 = vld1_dup_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
+
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
+
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() < (width_out - 16))
+ {
+ vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
+ vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
+ vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
+ vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
+ }
+ else
+ {
+ auto left_over = width_out - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vec_out + k * 4 + j) = c0.val[k][j];
+ }
+ }
+ }
+ },
+ ina, inb, out);
+}
+
+void inline vector_matrix_multiply_s8(Iterator &ina,
+ Iterator &inb,
+ Iterator &out,
+ int width_a,
+ int width_b,
+ int width_out,
+ size_t stride_b,
+ const Window &window)
+{
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ if (id.x() > width_b)
+ {
+ return;
+ }
+
+ // Accumulators for the block 0
+ int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
+
+ // This for loop performs 8 accumulations
+ for (; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const int8x8_t a00_s8 = vld1_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
+ const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
+ const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
+ const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
+ const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
+ const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
+ const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
+ const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
+
+ // Convert a00_s8 to int16_t (lower and higher halves)
+ const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}};
+
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+ const int16x4x4_t b10_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}};
+
+ const int16x4x4_t b20_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}};
+
+ const int16x4x4_t b30_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}};
+
+ const int16x4x4_t b40_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}};
+
+ const int16x4x4_t b50_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}};
+
+ const int16x4x4_t b60_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}};
+
+ const int16x4x4_t b70_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}};
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
+
+ // This for loop performs the left-over accumulations
+ for (; vec_a < vec_a_end_addr;)
+ {
+ const int8x8_t a00_s8 = vld1_dup_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b);
+
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+ // Convert a00_s8 to int16_t and get the lower part
+ const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
+
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() < (width_out - 16))
+ {
+ vst1q_s32(vec_out + 0, c0.val[0]);
+ vst1q_s32(vec_out + 4, c0.val[1]);
+ vst1q_s32(vec_out + 8, c0.val[2]);
+ vst1q_s32(vec_out + 12, c0.val[3]);
+ }
+ else
+ {
+ auto left_over = width_out - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vec_out + k * 4 + j) = c0.val[k][j];
+ }
+ }
+ }
+ },
+ ina, inb, out);
+}
+
+void inline matrix_multiply_u8(
+ Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+{
+ const auto width_out = static_cast<int>(out_info.dimension(0));
+ const auto height_out = static_cast<int>(out_info.dimension(1));
+ const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
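+ // Each step of the inner loop below consumes 4 interleaved values of matrix A (one per output row) and 16
+ // values of the transposed matrix B (one per output column), accumulating a 4x16 block of the int32 result.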
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const uint8_t *mtx_a0 = ina.ptr();
+ const uint8_t *mtx_b0 = inb.ptr();
+
+ // Note: Since the inputs are all positive, we can use uint32_t
+ // Accumulators for the block 0
+ uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 1
+ uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 2
+ uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 3
+ uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
+ const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
+
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+ // Convert b00_u8 to uint16_t
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+ // 4x4 block 0
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+ // 4x4 block 1
+ c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
+ c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
+ c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
+ c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
+ c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
+ c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
+ c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
+
+ // 4x4 block 3
+ c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
+ c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
+ c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
+ c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
+ }
+
+ auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+
+ if (id.y() < height_out && id.x() < (width_out - 16))
+ {
+ vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
+ vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
+ vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
+ vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
+ if (id.y() + 1 < height_out)
+ {
+ vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
+ vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
+ vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
+ vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
+ if (id.y() + 2 < height_out)
+ {
+ vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
+ vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
+ vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
+ vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
+ if (id.y() + 3 < height_out)
+ {
+ vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
+ vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
+ vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
+ vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
+ }
+ }
+ }
+ }
+ else
+ {
+ const auto left_over_value = width_out - id.x();
+ auto left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + k * 4 + j) = c0.val[k][j];
+ }
+ }
+ if (id.y() + 1 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+ }
+ }
+ if (id.y() + 2 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ }
+ }
+ if (id.y() + 3 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ ina, inb, out);
+}
+
+void inline matrix_multiply_s8(
+ Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+{
+ const auto width_out = static_cast<int>(out_info.dimension(0));
+ const auto height_out = static_cast<int>(out_info.dimension(1));
+ const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
+ // The implementation assumes that matrix A and matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
+ // Reshaping the matrices gives a cache-friendly implementation and avoids the data re-arrangements needed for computing 16x4 elements per iteration
+ // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
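+ // Concretely, each step of the inner loop below consumes 4 interleaved values of matrix A (one per output
+ // row) and 16 values of the transposed matrix B (one per output column), accumulating a 4x16 int32 block.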
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
+ auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
+
+ // Accumulators for the block 0
+ int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 1
+ int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 2
+ int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 3
+ int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const int8x8_t a00_s8 = vld1_s8(mtx_a0);
+ const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
+
+ // Convert a00_s8 to int16_t and get the lower part
+ const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+ // Convert b00_s8 to int16_t
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+ // 4x4 block 0
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+ // 4x4 block 1
+ c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
+ c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
+ c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
+ c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
+ c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
+ c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
+ c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
+
+ // 4x4 block 3
+ c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
+ c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
+ c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
+ c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
+ }
+ auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.y() < height_out && id.x() < (width_out - 16))
+ {
+ vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
+ vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
+ vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
+ vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
+ if (id.y() + 1 < height_out)
+ {
+ vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
+ vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
+ vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
+ vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
+ if (id.y() + 2 < height_out)
+ {
+ vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
+ vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
+ vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
+ vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
+ if (id.y() + 3 < height_out)
+ {
+ vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
+ vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
+ vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
+ vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
+ }
+ }
+ }
+ }
+ else if (id.y() < height_out)
+ {
+ const auto left_over_value = width_out - id.x();
+ auto left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + k * 4 + j) = c0.val[k][j];
+ }
+ }
+ if (id.y() + 1 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+ }
+ }
+ if (id.y() + 2 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ }
+ }
+ if (id.y() + 3 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ ina, inb, out);
+}
+
+Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+
+ TensorShape in0_shape = src0->tensor_shape();
+ TensorShape in1_shape = src1->tensor_shape();
+ TensorShape out_shape = dst->tensor_shape();
+
+ // Check vector-by-matrix case
+ if (out_shape[1] == 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1],
+ "The number of input0's columns must be equal to input1's rows");
+ }
+ else
+ {
+ in0_shape.collapse(2);
+ in1_shape.collapse(2);
+ out_shape.collapse(2);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2],
+                                        "Output tensor must have the same number of batches as the input0 tensor");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            in1_shape[2] != 1 && in0_shape[2] != in1_shape[2],
+            "Input1 tensor must have the same number of batches as input0 or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
+ }
+
+ return Status{};
+}
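+
+// For illustration of the checks above (shapes chosen arbitrarily, not mandated by this kernel):
+// a general-case dst of shape (32, 8, 2) requires src0 to carry the same 2 batches, src1 to carry
+// either 2 batches or a single one, and src1's width to be a multiple of 16 (e.g. 32 or 48).
+// In the vector-by-matrix case (dst height == 1) only src0's width has to match src1's height.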
+} // namespace
+
+void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_UNUSED(src0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst));
+
+ TensorShape in1_shape = src1->tensor_shape();
+ in1_shape.collapse(2);
+
+ _slide_matrix_b = in1_shape[2] != 1;
+
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win;
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+ if ((dst->dimension(1) == 1))
+ {
+ // Configure kernel window
+ win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
+ }
+ else
+ {
+ win =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ }
+
+ ICpuKernel::configure(win);
+}
+
+Status
+CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst));
+ return Status{};
+}
+
+void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+ if ((dst->info()->dimension(1) == 1))
+ {
+ const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0));
+ const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0));
+ const auto width_out = static_cast<int>(dst->info()->dimension(0));
+ const auto in_b_stride =
+ static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
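+        // For illustration: with width_matrix_b = 100, thread_id = 1 and num_threads = 4 this gives
+        // window_start_x = 16, window_step_x = 64 and window_end_x = ceil_to_multiple(84, 64) + 16 = 144,
+        // so this thread handles the 16-wide column blocks starting at x = 16 and x = 80.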
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (src1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(src0, win_a);
+ Iterator inb(src1, win_b);
+ Iterator out(dst, win_out);
+
+ switch (src0->info()->data_type())
+ {
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ {
+ vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+ window);
+ break;
+ }
+ case DataType::U8:
+ case DataType::QASYMM8:
+ {
+ vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+ window);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ }
+ else
+ {
+ const size_t in_b_stride = src1->info()->strides_in_bytes()[1];
+ const int width_b = src1->info()->dimension(0);
+
+        // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
+
+        // Set step_x and step_y for matrix B. Scale the X range by a factor of 16, as the transposed input matrix B has 16 times fewer columns than the output matrix
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (_slide_matrix_b)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
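+        // For illustration: for an output window covering x = [0, 64) and y = [0, 16), matrix A is
+        // iterated over y = [0, 4) and matrix B over x = [0, 4), since every iteration consumes one
+        // interleaved 4-row block of A and one transposed 16-column block of B.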
+
+        // The step x and step y for the output matrix have already been set in configure()
+ Iterator ina(src0, win_a);
+ Iterator inb(src1, win_b);
+ Iterator out(dst, window);
+
+ switch (src0->info()->data_type())
+ {
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ {
+ matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window);
+ break;
+ }
+ case DataType::U8:
+ case DataType::QASYMM8:
+ {
+ matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ }
+}
+
+const char *CpuGemmLowpMatrixMultiplyKernel::name() const
+{
+ return "CpuGemmLowpMatrixMultiplyKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..439ada1b47
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to multiply matrices
+ *
+ * @note @ref CpuGemmLowpMatrixMultiplyKernel low precision matrix product kernel
+ * This kernel performs the following computation:
+ *
+ * -# Convert the values of Matrix A from int8 to int32
+ * -# Convert the values of Matrix B from int8 to int32
+ * -# Compute the int32 matrix product of the converted A and B and store the result as int32
+ *
+ */
+class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel<CpuGemmLowpMatrixMultiplyKernel>
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpMatrixMultiplyKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * The input matrices @p src0 and @p src1 must be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel. These two
+ * kernels change the layout of the original matrices to be more cache-friendly.
+ *
+ * @param[in] src0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+ * @param[in] src1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[out] dst Output tensor info to store the result of matrix multiplication. Data type supported: S32
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixMultiplyKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ bool _slide_matrix_b{true};
+};
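+
+// A minimal validation sketch (tensor shapes below are illustrative assumptions, not mandated by
+// this kernel; the real shapes come from CpuGemmInterleave4x4Kernel / CpuGemmTranspose1xWKernel):
+//
+//   TensorInfo a(TensorShape(64U, 4U), 1, DataType::QASYMM8);  // interleaved Matrix A
+//   TensorInfo b(TensorShape(64U, 4U), 1, DataType::QASYMM8);  // transposed1xW Matrix B
+//   TensorInfo d(TensorShape(16U, 16U), 1, DataType::S32);     // S32 matrix product
+//   const Status st = CpuGemmLowpMatrixMultiplyKernel::validate(&a, &b, &d);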
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H*/
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
new file mode 100644
index 0000000000..9a099bd1b6
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2017-2021,2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(1),
+ "Output vector must have length equal to the number of rows of the input matrix");
+ }
+ return Status{};
+}
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(0),
+ "Output vector must have length equal to the number of columns of the input matrix");
+ }
+ return Status{};
+}
+} // namespace
+
+void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info));
+ _k = info.k;
+ _scalar = info.scalar;
+ _mul_by_scalar = info.mul_by_scalar;
+
+ switch (src->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8:
+ case DataType::QSYMM8_PER_CHANNEL:
+ _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32);
+
+ Window win = calculate_max_window(*dst, Steps(1));
+ ICpuKernel::configure(win);
+}
+
+Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
+ return Status{};
+}
+
+template <typename T>
+void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src,
+ ITensor *dst,
+ const arm_compute::Window &window)
+{
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
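+    // For example, with T = uint8_t these promote to uint16_t (TIAcc) and uint32_t (TAcc),
+    // while T = int8_t promotes to int16_t and int32_t.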
+
+ Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+ Window win_input(collapsed_window);
+ win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator in(src, win_input);
+ Iterator out(dst, collapsed_window);
+
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ TAcc sum_row = 0;
+
+ const T *matrix_a = reinterpret_cast<const T *>(
+ (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+#endif /* __arm__ */
+
+ int i = 0;
+ // This for loop performs 16 accumulations
+ for (; i <= (_k - 16); i += 16)
+ {
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i);
+
+ // Partial accumulations in U16
+ const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
+
+ // Accumulate to U32
+ vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
+ }
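+            // In the loop above each iteration reduces 16 8-bit values: vaddl adds the low and high
+            // halves into 8 lanes of 16 bits, and vpaddl pairwise-widens those into the 4 lanes of
+            // the 32-bit accumulator vsum_row.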
+
+ // This for loop performs the leftover accumulations
+ for (; i < _k; ++i)
+ {
+ sum_row += static_cast<TAcc>(matrix_a[i]);
+ }
+
+#if defined(__aarch64__)
+ // Reduction operation available on 64 bit architectures only
+ sum_row += wrapper::vaddv(vsum_row);
+#else // __aarch64__
+ auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+ tmp = wrapper::vpadd(tmp, tmp);
+
+ sum_row += wrapper::vgetlane(tmp, 0);
+#endif // __aarch64__
+
+ // Multiply by scalar if necessary
+ if (_mul_by_scalar)
+ {
+ sum_row *= _scalar;
+ }
+
+ *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
+ },
+ in, out);
+}
+
+void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (this->*_func)(src, dst, window);
+}
+
+const char *CpuGemmLowpMatrixAReductionKernel::name() const
+{
+ return "CpuGemmLowpMatrixAReductionKernel";
+}
+
+void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));
+
+ _k = info.k;
+ _scalar = info.scalar;
+ _mul_by_scalar = info.mul_by_scalar;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ switch (src->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8:
+ case DataType::QSYMM8_PER_CHANNEL:
+ _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32);
+
+ // Configure kernel window
+ Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration));
+ ICpuKernel::configure(win);
+}
+
+Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
+ return Status{};
+}
+
+template <typename T>
+void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
+
+ Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
+ const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
+
+ const auto width_matrix_b = static_cast<int>(src->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(src->info()->strides_in_bytes()[1]);
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
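+    // For illustration: with width_matrix_b = 40, thread_id = 0 and num_threads = 2, this thread
+    // starts at x = 0 with a step of 32 and visits the blocks at x = 0 and x = 32; the second one
+    // has only 8 columns left and is handled by the left-over path further down.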
+
+ Window win_out(collapsed_window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+ Window win_in(win_out);
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator inb(src, win_in);
+ Iterator out(dst, win_out);
+
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &id)
+ {
+ if (id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+            // Note: Since the input is 8 bit, we can safely use the 32-bit TAcc accumulators for the accumulation
+ // 4 x u/int32x4_t = 16 column accumulators
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = {
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})};
+
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
+#endif /* __arm__ */
+
+ // If we have less than 16 columns left, we can't use the main unrolled loop
+ if ((width_matrix_b - id.x()) >= 16)
+ {
+ // Row index
+ int i = 0;
+ // 4 x u/int32x4_t = 16 columns unrolled across 4 rows
+ for (; i <= (_k - 4); i += 4)
+ {
+ // Load 4 rows of 16 columns of 8bit elements
+ // (| | )
+ // (| | )
+ // (| | )
+ // (| | )
+ const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+ const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+ const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
+#endif /* __arm__ */
+
+ // Partial accumulation to 16bit (4 rows => 2 rows)
+ // (| | | )
+ // (| | | )
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
+ {wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})};
+
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+ // Accumulate to 32bit (2 rows => 1 row)
+ // (| | | | | )
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
+
+ matrix_b += 4 * in_b_stride;
+ }
+
+ // This for loop accumulates the rows left over from the 4x unrolling above
+ for (; i < _k; ++i)
+ {
+ const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+
+ // Convert 8bit => 16bit
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type
+ b0_b16[2]{wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))};
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
+
+ matrix_b += in_b_stride;
+ }
+ }
+ else
+ {
+                // Accumulate the left-over columns into sum_col
+ for (int i = 0; i < _k; ++i) // row loop
+ {
+ auto left_over_cols = width_matrix_b - id.x();
+ auto l = left_over_cols;
+ for (auto k = 0; k < 4 && l; ++k)
+ {
+ for (auto j = 0; j < 4 && l; ++j, --l)
+ {
+ sum_col[k][j] += matrix_b[left_over_cols - l];
+ }
+ }
+ matrix_b += in_b_stride;
+ }
+ }
+
+ // Multiply by scalar if necessary
+ if (_mul_by_scalar)
+ {
+ sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+ sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+ sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+ sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+ }
+
+ auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+ if ((width_matrix_b - id.x()) >= 16)
+ {
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
+ }
+ else
+ {
+ auto left_over = width_matrix_b - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+ }
+ }
+ }
+ },
+ inb, out);
+}
+
+void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (this->*_func)(src, dst, window, info);
+}
+
+const char *CpuGemmLowpMatrixBReductionKernel::name() const
+{
+ return "CpuGemmLowpMatrixBReductionKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
new file mode 100644
index 0000000000..20ef17e96d
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+struct GEMMLowpReductionKernelInfo;
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel<CpuGemmLowpMatrixAReductionKernel>
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpMatrixAReductionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixAReductionKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[out] dst Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+ * @param[in] info Kernel metadata:
+ * - k (num_mtx_a_cols) Number of matrix A columns
+ * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
+ * - scalar Scalar value to multiply each reduced row by.
+ *                  - mul_by_scalar     True if each reduced row must be multiplied by a scalar value.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixAReductionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Execution of the reduction kernel specialized on the input type
+ *
+ * @param[in] src Input tensor
+     * @param[out] dst    Output tensor
+ * @param[in] window Execution window
+ */
+ template <typename T>
+ void run_internal(const ITensor *src, ITensor *dst, const Window &window);
+
+ /** Common signature for all reduction functions
+ *
+ * @param[in] src Input tensor
+ * @param[out] dst Output tensor
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window);
+
+ CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr};
+ int32_t _k{0};
+ int32_t _scalar{0};
+ bool _mul_by_scalar{false};
+};
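+
+// A minimal validation sketch (values are illustrative; it assumes GEMMLowpReductionKernelInfo is
+// default-constructible and exposes the k / mul_by_scalar members documented in configure() above):
+//
+//   TensorInfo mtx_a(TensorShape(32U, 8U), 1, DataType::QASYMM8); // 8 rows of 32 columns
+//   TensorInfo row_sums(TensorShape(8U), 1, DataType::S32);       // one S32 sum per row
+//   GEMMLowpReductionKernelInfo info{};
+//   info.k             = 32;   // number of Matrix A columns to accumulate
+//   info.mul_by_scalar = false;
+//   const Status st = CpuGemmLowpMatrixAReductionKernel::validate(&mtx_a, &row_sums, info);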
+
+/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel<CpuGemmLowpMatrixBReductionKernel>
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpMatrixBReductionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixBReductionKernel);
+ /** Initialise the kernel's input and output.
+ *
+     * @param[in]  src  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[out] dst Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+ * @param[in] info Kernel metadata:
+ * - k (num_mtx_b_rows) Number of matrix B rows.
+ * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
+     *                  - scalar            Scalar value to multiply each reduced column by.
+     *                  - mul_by_scalar     True if each reduced column must be multiplied by a scalar value.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixBReductionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Execution of the reduction kernel specialized on the input type
+ *
+ * @param[in] src Input tensor
+     * @param[out] dst    Output tensor
+ * @param[in] window Execution window
+ * @param[in] info Thread-related information
+ */
+ template <typename T>
+ void run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info);
+
+ /** Common signature for all reduction functions
+ *
+ * @param[in] src Input tensor
+ * @param[out] dst Output tensor
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info);
+
+ CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr};
+ int32_t _k{0};
+ int32_t _scalar{0};
+ bool _mul_by_scalar{false};
+};
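+
+// The Matrix B variant is validated the same way, except that the output row-vector must be as
+// wide as Matrix B itself (dst->dimension(0) == src->dimension(0)) and info.k counts the Matrix B
+// rows being accumulated, e.g. a (32U, 8U) QASYMM8 matrix reduces into a TensorShape(32U) S32 vector.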
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
new file mode 100644
index 0000000000..2a76a5958d
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2017-2022,2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32, DataType::F32);
+
+    // We run if the offset is nonzero or a sum col has been provided; the second
+    // condition is needed in case the QuantizationInfo is dynamic
+ if (a_offset != 0 || vector_sum_col != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+ }
+
+    // We run if the offset is nonzero or a sum row has been provided; the second
+    // condition is needed in case the QuantizationInfo is dynamic
+ if (b_offset != 0 || vector_sum_row != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ TensorShape output_shape = mm_result->tensor_shape();
+ if (output_shape.num_dimensions() > 1)
+ {
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches as the output tensor");
+
+ if (vector_sum_col != nullptr)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches as "
+                                                "vector_sum_row or the number of batches must be set to 1");
+ }
+ }
+ }
+
+ return Status{};
+}
+
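+// The two helpers below add the quantization offset corrections to the matrix-multiplication
+// result: for every element, mm_result[x, y] += a_offset * vector_sum_col[x] +
+// b_offset * vector_sum_row[y] + k_offset (with k_offset typically equal to a_offset * b_offset * k).
+// The float variant additionally scales the correction by 'scale' so that it matches an
+// already-dequantized F32 mm_result.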
+void run_offset_contribution_float(const Window &window,
+ ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ float scale,
+ bool slide_vector_sum_col,
+ bool is_gemm3d)
+{
+ Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+ collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16;
+
+ // if vector_sum_col is nullptr then stride_y is 0, else get stride_y
+ const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0;
+ Iterator mm_result_it(mm_result, collapsed_window);
+
+ if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
+ {
+ // Set window for vector_sum_col
+ Window win_vector_sum_col(collapsed_window);
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row(collapsed_window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col = batch_id * (sum_col_stride_y);
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<float *>(mm_result_it.ptr());
+
+ // Compute the leftover term due to b_offset.
+ int32_t b_offset_term_s32 =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
+
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ // Add a_offset_term_s32 and b_offset_term_s32
+ int32x4x4_t offset_term_s32 = {
+ {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
+
+ offset_term_s32.val[0] =
+ vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
+ offset_term_s32.val[1] =
+ vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
+ offset_term_s32.val[2] =
+ vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
+ offset_term_s32.val[3] =
+ vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+
+ float32x4x4_t in_f32 = {{vld1q_f32(mm_result_ptr + x + 0), vld1q_f32(mm_result_ptr + x + 4),
+ vld1q_f32(mm_result_ptr + x + 8), vld1q_f32(mm_result_ptr + x + 12)}};
+
+ // Convert and scale the S32 offsets to match the already scaled GEMM results
+ float32x4x4_t offset_terms_scaled = {{
+ vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[0]), scale),
+ vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[1]), scale),
+ vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[2]), scale),
+ vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[3]), scale),
+ }};
+
+ // Add the offset terms to the GEMM result
+ in_f32.val[0] = vaddq_f32(in_f32.val[0], offset_terms_scaled.val[0]);
+ in_f32.val[1] = vaddq_f32(in_f32.val[1], offset_terms_scaled.val[1]);
+ in_f32.val[2] = vaddq_f32(in_f32.val[2], offset_terms_scaled.val[2]);
+ in_f32.val[3] = vaddq_f32(in_f32.val[3], offset_terms_scaled.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_f32(mm_result_ptr + x + 0, in_f32.val[0]);
+ vst1q_f32(mm_result_ptr + x + 4, in_f32.val[1]);
+ vst1q_f32(mm_result_ptr + x + 8, in_f32.val[2]);
+ vst1q_f32(mm_result_ptr + x + 12, in_f32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Compute the leftover term due to a_offset.
+ int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ a_offset_term_s32 *= a_offset;
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += (k_offset + a_offset_term_s32 + b_offset_term_s32) * scale;
+ }
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it);
+ }
+ else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row(collapsed_window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ auto mm_result_ptr = reinterpret_cast<float *>(mm_result_it.ptr());
+
+ // Compute the leftover term due to b_offset.
+ int32_t row_sum =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ float scaled_b_offset_term_f32 = row_sum * b_offset * scale;
+
+ const float32x4_t b_offset_term_f32_vec = vdupq_n_f32(scaled_b_offset_term_f32);
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ float32x4x4_t in_f32 = {{vld1q_f32(mm_result_ptr + x + 0), vld1q_f32(mm_result_ptr + x + 4),
+ vld1q_f32(mm_result_ptr + x + 8), vld1q_f32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_f32.val[0] = vaddq_f32(in_f32.val[0], b_offset_term_f32_vec);
+ in_f32.val[1] = vaddq_f32(in_f32.val[1], b_offset_term_f32_vec);
+ in_f32.val[2] = vaddq_f32(in_f32.val[2], b_offset_term_f32_vec);
+ in_f32.val[3] = vaddq_f32(in_f32.val[3], b_offset_term_f32_vec);
+
+ // Store the result with the offset contribution
+ vst1q_f32(mm_result_ptr + x + 0, in_f32.val[0]);
+ vst1q_f32(mm_result_ptr + x + 4, in_f32.val[1]);
+ vst1q_f32(mm_result_ptr + x + 8, in_f32.val[2]);
+ vst1q_f32(mm_result_ptr + x + 12, in_f32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += scaled_b_offset_term_f32;
+ }
+ },
+ vector_sum_row_it, mm_result_it);
+ }
+ else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
+ {
+ // Set window for vector_sum_col
+ Window win_vector_sum_col(collapsed_window);
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col =
+ batch_id *
+ (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<float *>(mm_result_it.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ float32x4x4_t a_offset_term_scaled = {{
+ vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[0]), scale),
+ vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[1]), scale),
+ vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[2]), scale),
+ vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[3]), scale),
+ }};
+
+ float32x4x4_t in_f32 = {{vld1q_f32(mm_result_ptr + x + 0), vld1q_f32(mm_result_ptr + x + 4),
+ vld1q_f32(mm_result_ptr + x + 8), vld1q_f32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_f32.val[0] = vaddq_f32(in_f32.val[0], a_offset_term_scaled.val[0]);
+ in_f32.val[1] = vaddq_f32(in_f32.val[1], a_offset_term_scaled.val[1]);
+ in_f32.val[2] = vaddq_f32(in_f32.val[2], a_offset_term_scaled.val[2]);
+ in_f32.val[3] = vaddq_f32(in_f32.val[3], a_offset_term_scaled.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_f32(mm_result_ptr + x + 0, in_f32.val[0]);
+ vst1q_f32(mm_result_ptr + x + 4, in_f32.val[1]);
+ vst1q_f32(mm_result_ptr + x + 8, in_f32.val[2]);
+ vst1q_f32(mm_result_ptr + x + 12, in_f32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Compute the leftover term due to a_offset.
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += a_offset_term_s32 * a_offset * scale;
+ }
+ },
+ vector_sum_col_it, mm_result_it);
+ }
+ else // false, false
+ {
+ // No offset contribution from matrix A and matrix B
+ return;
+ }
+}
+
+void run_offset_contribution(const Window &window,
+ ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool slide_vector_sum_col,
+ bool is_gemm3d)
+{
+ Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+ collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16;
+
+ // if vector_sum_col is nullptr then stride_y is 0, else get stride_y
+ const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0;
+ Iterator mm_result_it(mm_result, collapsed_window);
+
+ if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
+ {
+ // Set window for vector_sum_col
+ Window win_vector_sum_col(collapsed_window);
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row(collapsed_window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col = batch_id * (sum_col_stride_y);
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+ // Compute the leftover term due to b_offset.
+ int32_t b_offset_term_s32 =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
+
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ // Add a_offset_term_s32 and b_offset_term_s32
+ int32x4x4_t offset_term_s32 = {
+ {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
+
+ offset_term_s32.val[0] =
+ vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
+ offset_term_s32.val[1] =
+ vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
+ offset_term_s32.val[2] =
+ vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
+ offset_term_s32.val[3] =
+ vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Compute the leftover term due to a_offset.
+ int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ a_offset_term_s32 *= a_offset;
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
+ }
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it);
+ }
+ else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row(collapsed_window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+ // Compute the leftover term due to b_offset.
+ int32_t b_offset_term_s32 =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
+
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += b_offset_term_s32;
+ }
+ },
+ vector_sum_row_it, mm_result_it);
+ }
+ else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
+ {
+ // Set window for vector_sum_col
+ Window win_vector_sum_col(collapsed_window);
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col =
+ batch_id *
+ (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Compute the leftover term due to a_offset.
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += a_offset_term_s32 * a_offset;
+ }
+ },
+ vector_sum_col_it, mm_result_it);
+ }
+ else // a_offset == 0, b_offset == 0
+ {
+ // No offset contribution from matrix A and matrix B
+ return;
+ }
+}
+} // namespace
+
+void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result,
+ ITensorInfo *vector_sum_col,
+ ITensorInfo *vector_sum_row,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ float scale)
+{
+ // Perform validate step
+ ARM_COMPUTE_UNUSED(vector_sum_row);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _k = k;
+
+ _scale = scale;
+
+ if (vector_sum_col != nullptr)
+ {
+ // Check whether vector_sum_col should be slid along the y dimension
+ // Don't slide vector_sum_col along the y dimension if vector_sum_col has just 1 dimension and vector_sum_row has more than 1
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result, Steps());
+ ICpuKernel::configure(win);
+}
+
+void CpuGemmLowpOffsetContributionKernel::set_a_offset(int32_t a_offset)
+{
+ _a_offset = a_offset;
+}
+
+void CpuGemmLowpOffsetContributionKernel::set_b_offset(int32_t b_offset)
+{
+ _b_offset = b_offset;
+}
+
+void CpuGemmLowpOffsetContributionKernel::set_scale(float scale)
+{
+ _scale = scale;
+}
+
+Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+ return Status{};
+}
+
+void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto mm_result = tensors.get_tensor(TensorType::ACL_DST);
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+ mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+
+ // Compute the constant offset term and select the implementation based on the result data type
+ auto k_offset = _a_offset * _b_offset * _k;
+ if (mm_result->info()->data_type() == DataType::F32)
+ {
+ run_offset_contribution_float(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, k_offset,
+ _scale, _slide_vector_sum_col, reinterpret_as_3d);
+ }
+ else
+ {
+ run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, k_offset,
+ _slide_vector_sum_col, reinterpret_as_3d);
+ }
+}
+
+const char *CpuGemmLowpOffsetContributionKernel::name() const
+{
+ return "CpuGemmLowpOffsetContributionKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
new file mode 100644
index 0000000000..ecbfb0c282
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017-2022,2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to add the offset contribution after @ref CpuGemmLowpMatrixMultiplyKernel. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
+ *
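+ * For a single accumulator this is equivalent to the following scalar update (an illustrative
+ * sketch only; the kernel itself operates on int32 NEON vectors):
+ *
+ * int32_t acc = mm_result[i][k];
+ * acc += vector_sum_col[k] * a_offset; // contribution of the A offset
+ * acc += vector_sum_row[i] * b_offset; // contribution of the B offset
+ * acc += a_offset * b_offset * k; // constant term, k = number of A columns / B rows
+ * mm_result[i][k] = acc; // updated in place
+ *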
+ */
+class CpuGemmLowpOffsetContributionKernel : public ICpuKernel<CpuGemmLowpOffsetContributionKernel>
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpOffsetContributionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in, out] mm_result Input tensor containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] k Number of matrix A columns or matrix B rows
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] scale (Optional) Scale factor applied to the contribution so that it matches the scale of the dst in the case where mm_result is float
+ * (and so has already been scaled). Default is 1.0
+ */
+ void configure(ITensorInfo *mm_result,
+ ITensorInfo *vector_sum_col,
+ ITensorInfo *vector_sum_row,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ float scale = 1.0f);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpOffsetContributionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset);
+
+ /** Set the a offset
+ * Warning: if a_offset is non-zero then vector_sum_col must be provided to run_op.
+ * If in doubt, run configure() or validate() again.
+ *
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ */
+ void set_a_offset(int32_t a_offset);
+
+ /** Set the b offset
+ * Warning: if b_offset is non-zero then vector_sum_row must be provided to run_op.
+ * If in doubt, run configure() or validate() again.
+ *
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ */
+ void set_b_offset(int32_t b_offset);
+
+ /** Set the dequantize scale
+ *
+ * @param[in] scale Scale factor applied to the contribution so that it matches the scale of the dst in the case
+ * where mm_result is float (and so has already been scaled).
+ */
+ void set_scale(float scale);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ int32_t _k{0}; // Number of columns of A or rows of B, used in last offset term
+ float _scale{1.0};
+ bool _slide_vector_sum_col{true};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000000..3c113f2828
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,1036 @@
+/*
+ * Copyright (c) 2019-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
+{
+ return {{vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)}};
+}
+
+inline int32x4x4_t load(const int32_t *ptr, int32_t x)
+{
+ return {{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}};
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
+{
+ return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}};
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
+{
+ return {{vaddq_s32(a.val[0], b.val[0]), vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]),
+ vaddq_s32(a.val[3], b.val[3])}};
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
+{
+ return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar),
+ vmulq_n_s32(a.val[3], mul_scalar)}};
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multiplier)
+{
+ return {{vmulq_s32(a.val[0], vld1q_s32(multiplier)), vmulq_s32(a.val[1], vld1q_s32(multiplier + 4)),
+ vmulq_s32(a.val[2], vld1q_s32(multiplier + 8)), vmulq_s32(a.val[3], vld1q_s32(multiplier + 12))}};
+}
+
+inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
+{
+ int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x);
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+ return a_offset_term_s32;
+}
+
+inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset)
+{
+ int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr);
+ b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+ return b_offset_term_s32;
+}
+
+inline int32x4x4_t get_k_offset(int32_t k_offset)
+{
+ return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
+}
+
+inline uint8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
+
+ // Convert S16 to U8
+ uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+ if (is_bounded_relu)
+ {
+ out_u8 = vmaxq_u8(out_u8, min_u8);
+ out_u8 = vminq_u8(out_u8, max_u8);
+ }
+
+ return out_u8;
+}
+
+inline int8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if (is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
+inline int8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0]));
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1]));
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2]));
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3]));
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if (is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
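+// Maps a scalar type T (uint8_t / int8_t) to the corresponding 128-bit NEON vector type.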
+template <typename T>
+struct VectorTyper
+{
+ using stype = T;
+ using vtype = typename wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128>;
+};
+
+inline Window get_win_vector_sum(const Window &window)
+{
+ Window win_vector_sum(window);
+ win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ return win_vector_sum;
+}
+
+inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col)
+{
+ Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window));
+ return vector_sum_col_it;
+}
+
+inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row)
+{
+ Window win_vector_sum_row = get_win_vector_sum(window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+ return vector_sum_row_it;
+}
+
+inline Iterator get_bias_it(const Window &window, const ITensor *bias)
+{
+ Window win_bias(window);
+ win_bias.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ Iterator bias_it(bias, win_bias);
+ return bias_it;
+}
+
+template <typename VT>
+inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr,
+ const int32_t *vector_sum_row_ptr,
+ const int32_t *bias_ptr,
+ Iterator mm_result_it,
+ Iterator out_it,
+ const int32x4_t result_offset_s32,
+ const int32x4_t result_shift_s32,
+ typename VT::vtype min_vec,
+ typename VT::vtype max_vec,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ int32_t multiplier,
+ int32_t shift,
+ int32_t offset,
+ int32_t min_bound,
+ int32_t max_bound,
+ int window_step_x,
+ int window_start_x,
+ int window_end_x,
+ bool has_a_offset,
+ bool has_b_offset,
+ bool has_bias,
+ bool is_bounded_relu,
+ bool is_fixed_point)
+{
+ int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+ if (!is_fixed_point)
+ {
+ // Combine quantization offset with other offsets.
+ offset_term_s32 = add_s32(offset_term_s32, result_offset_s32);
+ }
+ if (has_a_offset && has_b_offset)
+ {
+ offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset));
+ }
+ if (has_b_offset)
+ {
+ offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset));
+ }
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
+
+ if (has_a_offset)
+ {
+ in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
+ }
+ if (has_bias)
+ {
+ in_s32 = add_s32(in_s32, load(bias_ptr, x));
+ }
+ if (!is_fixed_point || has_b_offset)
+ {
+ in_s32 = add_s32(in_s32, offset_term_s32);
+ }
+ if (!is_fixed_point)
+ {
+ in_s32 = mul_s32(in_s32, multiplier);
+ }
+
+ if (is_fixed_point)
+ {
+ wrapper::vstore(
+ reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+ finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
+ }
+ else
+ {
+ wrapper::vstore(
+ reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+ finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
+ }
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t in_value =
+ *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+
+ if (has_a_offset)
+ {
+ in_value += (*(vector_sum_col_ptr + x) * a_offset);
+ }
+ if (has_bias)
+ {
+ in_value += *(bias_ptr + x);
+ }
+
+ if (is_fixed_point)
+ {
+ // Finalize and store the result
+ *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+ finalize_quantization(in_value, multiplier, shift, offset, static_cast<typename VT::stype>(min_bound),
+ static_cast<typename VT::stype>(max_bound), is_bounded_relu);
+ }
+ else
+ {
+ // Finalize quantization
+ in_value = (in_value * multiplier) >> shift;
+
+ // Bound and store the result
+ if (is_bounded_relu)
+ {
+ in_value = static_cast<typename VT::stype>(
+ std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+ }
+ *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+ static_cast<typename VT::stype>(std::max<int32_t>(
+ static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
+ std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
+ }
+ }
+}
+
+inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr,
+ const int32_t *bias_ptr,
+ Iterator mm_result_it,
+ Iterator out_it,
+ const int32_t *result_multipliers,
+ const int32_t *result_shifts,
+ const int32x4_t result_offset,
+ int8x16_t min_s8,
+ int8x16_t max_s8,
+ int32_t a_offset,
+ int32_t offset,
+ int32_t min_bound,
+ int32_t max_bound,
+ int window_step_x,
+ int window_start_x,
+ int window_end_x,
+ bool has_a_offset,
+ bool has_bias,
+ bool is_bounded_relu,
+ bool is_fixed_point)
+{
+ int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+ if (!is_fixed_point)
+ {
+ // Combine quantization offset with other offsets.
+ offset_term_s32 = add_s32(offset_term_s32, result_offset);
+ }
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
+
+ if (has_a_offset)
+ {
+ in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
+ }
+ if (has_bias)
+ {
+ in_s32 = add_s32(in_s32, load(bias_ptr, x));
+ }
+ if (!is_fixed_point)
+ {
+ in_s32 = add_s32(in_s32, offset_term_s32);
+ in_s32 = mul_s32(in_s32, result_multipliers + x);
+ }
+
+ if (is_fixed_point)
+ {
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x),
+ finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x),
+ result_offset, min_s8, max_s8, is_bounded_relu));
+ }
+ else
+ {
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(out_it.ptr() + x),
+ finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
+ }
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t in_value =
+ *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+
+ if (has_a_offset)
+ {
+ in_value += (*(vector_sum_col_ptr + x) * a_offset);
+ }
+ if (has_bias)
+ {
+ in_value += *(bias_ptr + x);
+ }
+
+ if (is_fixed_point)
+ {
+ // Finalize and store the result
+ *(out_it.ptr() + x) =
+ finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset,
+ static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
+ }
+ else
+ {
+ // Finalize quantization
+ in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
+
+ // Bound and store the result
+ if (is_bounded_relu)
+ {
+ in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+ }
+ *(out_it.ptr() + x) = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+ }
+ }
+}
+
+template <typename T>
+void run_offset_contribution_output_stage(const Window &window,
+ const ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ const ITensor *bias,
+ ITensor *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool is_vector_sum_col_batched,
+ GEMMLowpOutputStageInfo output_stage,
+ bool is_gemm3d,
+ bool is_bounded_relu,
+ bool is_fixed_point)
+{
+ // Semantics of XYZW Explained for each tensor
+ //
+ // | Tensor | XYZW when is_gemm3d == false | XYZW when is_gemm3d == true |
+ // -------------------------------------------------------------------------------------------------------------------
+ // | mm_result | x -> width, y -> height, z -> batch | x -> width, y -> height, z -> depth, w -> batch |
+ // | collapsed window | x -> width, y -> height, z -> batch | x -> width, y -> height, z -> depth * batch |
+ // | vector_sum_row | x -> height, y -> batch | x -> height * depth, y -> batch |
+ // | Vector_sum_col | x -> width, y -> batch | x -> width, y -> batch |
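+ //
+ // Example (is_gemm3d == true): the collapsed z coordinate encodes depth and batch, so
+ // batch_id = id.z() / depth_input and the index into vector_sum_row is
+ // id.y() + (id.z() % depth_input) * height_input.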
+
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ using Typer = VectorTyper<T>;
+
+ const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int32_t multiplier = output_stage.gemmlowp_multiplier;
+ const int32_t shift = output_stage.gemmlowp_shift;
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
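+ // For the scale-only path the shift is negated so that vshlq_s32 in finalize_quantization_floating_point
+ // performs an arithmetic right shift; the fixed-point path consumes the positive shift directly.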
+ const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift);
+ const auto min_vec = wrapper::vdup_n(static_cast<T>(min_bound), ExactTagType{});
+ const auto max_vec = wrapper::vdup_n(static_cast<T>(max_bound), ExactTagType{});
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
+
+ Iterator mm_result_it(mm_result, win);
+ Iterator out_it(output, win);
+
+ if ((a_offset != 0) && (b_offset != 0))
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+ Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ // Offset in case vector_sum_col is batched in y dimension
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+
+ if (bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
+ mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset,
+ k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x,
+ window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32,
+ result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false,
+ is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
+ }
+ }
+ else if ((a_offset == 0) && (b_offset != 0))
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+ Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ if (bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+ false, true, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_row_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_row_it, mm_result_it, out_it);
+ }
+ }
+ else if ((a_offset != 0) && (b_offset == 0))
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+
+ // Offset in case vector_sum_col is batched in y dimension
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+
+ if (bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+ true, false, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
+ }
+ }
+ else
+ {
+ if (bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier,
+ shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false,
+ true, is_bounded_relu, is_fixed_point);
+ },
+ bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec,
+ max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ mm_result_it, out_it);
+ }
+ return;
+ }
+}
+
+void run_offset_contribution_output_stage_symm(const Window &window,
+ const ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ const ITensor *bias,
+ ITensor *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool is_vector_sum_col_batched,
+ GEMMLowpOutputStageInfo output_stage,
+ bool is_gemm3d,
+ bool is_bounded_relu,
+ bool is_fixed_point)
+{
+ ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
+
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
+ const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+ const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound));
+ const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
+
+ Iterator mm_result_it(mm_result, win);
+ Iterator out_it(output, win);
+
+ if (a_offset != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+
+ // Offset in case vector_sum_col is batched in y dimension
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+
+ if (bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window_symm(
+ vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window_symm(
+ vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x,
+ window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
+ }
+ }
+ else
+ {
+ if (bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm(
+ nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu,
+ is_fixed_point);
+ },
+ bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm(
+ nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32,
+ min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x,
+ window_end_x, false, false, is_bounded_relu, is_fixed_point);
+ },
+ mm_result_it, out_it);
+ }
+ return;
+ }
+}
+
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+ if (output->data_type() != DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 &&
+ b_offset != 0);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN &&
+ output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+ }
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->num_dimensions() > 2);
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if (b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ TensorShape output_shape = output->tensor_shape();
+ if (output_shape.num_dimensions() > 1)
+ {
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ output_shape.collapse_from(output_batch_idx);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+ "mm_result tensor must have the same number of batches of output tensor");
+
+ if (a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
+ }
+ }
+
+ // Check Tensor Rank of vector_sum_row
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3);
+ }
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_UNUSED(vector_sum_row, bias);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
+
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _k = k;
+ _output_stage = output_stage;
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0)
+ {
+ // Check whether vector_sum_col should be slid along the y dimension
+ // Don't slide vector_sum_col along the y dimension if vector_sum_col has just 1 dimension and vector_sum_row has more than 1
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ _is_vector_sum_col_batched = vector_sum_col->tensor_shape().num_dimensions() > 1;
+ }
+
+ // Output auto inizialitation if not yet initialized
+ auto_init_if_empty(*dst, mm_result->clone()->set_data_type(DataType::QASYMM8));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result, Steps());
+
+ // Note: This kernel performs 16 elements per iteration.
+ // However, since we use a left-over for loop, we cannot have any read or write out of memory
+ // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped
+ ICpuKernel::configure(win);
+}
+
+Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+ return Status{};
+}
+
+void CpuGemmLowpOffsetContributionOutputStageKernel::set_a_offset(int32_t a_offset)
+{
+ _a_offset = a_offset;
+}
+
+void CpuGemmLowpOffsetContributionOutputStageKernel::set_b_offset(int32_t b_offset)
+{
+ _b_offset = b_offset;
+}
+
+void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto mm_result = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_3);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(dst->info()->data_type());
+ int32_t type_min_int = type_min.get<int32_t>();
+ int32_t type_max_int = type_max.get<int32_t>();
+
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+ mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+
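+ // Clamping (bounded ReLU) is only needed when the requested bounds are tighter than the full
+ // representable range of the output data type.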
+ const bool is_bounded_relu =
+ !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
+
+ // Check if we need to perform fixed point requantization
+ const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+
+ // Check if the output is signed (QASYMM8_SIGNED)
+ const bool is_signed = dst->info()->data_type() == DataType::QASYMM8_SIGNED;
+
+ // Check if symmetric per-channel execution
+ const bool is_symm = _output_stage.is_quantized_per_channel;
+
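+ // Constant term of the offset contribution (a_offset * b_offset * k)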
+ auto k_offset = _a_offset * _b_offset * _k;
+ if (is_symm)
+ {
+ run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst,
+ _a_offset, _b_offset, k_offset, _is_vector_sum_col_batched,
+ _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ }
+ else
+ {
+ if (is_signed)
+ {
+ run_offset_contribution_output_stage<int8_t>(
+ window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, k_offset,
+ _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ }
+ else
+ {
+ run_offset_contribution_output_stage<uint8_t>(
+ window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, k_offset,
+ _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ }
+ }
+}
+
+const char *CpuGemmLowpOffsetContributionOutputStageKernel::name() const
+{
+ return "CpuGemmLowpOffsetContributionOutputStageKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
new file mode 100644
index 0000000000..ff706ff3dc
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to add the offset contribution and perform the output stage after @ref CpuGemmLowpMatrixMultiplyKernel.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel),
+ * adds to it the offset contribution of matrix A and matrix B, and then performs the requested output stage,
+ * writing the quantized result to the destination tensor.
+ *
+ * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8.
+ * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8.
+ *
+ * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is:
+ *
+ * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift
+ *
+ * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is:
+ *
+ * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ * (x * y) / 2^31
+ *
+ * and mm_result'[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
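+ *
+ * As an illustrative sketch (scale-only output stage, QASYMM8 destination), a single element is
+ * processed roughly as:
+ *
+ * int32_t acc = mm_result'[i][k];
+ * acc = ((acc + result_offset) * result_mult_int) >> result_shift;
+ * acc = std::max(0, std::min(255, acc)); // clamped further to [min, max] if a bounded ReLU is requested
+ * dst[i][k] = static_cast<uint8_t>(acc);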
+ */
+
+class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel<CpuGemmLowpOffsetContributionOutputStageKernel>
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpOffsetContributionOutputStageKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionOutputStageKernel);
+ /** Initialise the kernel inputs and output.
+ *
+ * @param[in] mm_result Input tensor info containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector tensor info of sums of all the entries in each column of matrix B.
+ * Can be a 1D or 2D tensor, in case of 2D, y dim is the batch dimension
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector tensor info of sums of all the entries in each row of matrix A.
+ * Can be a 1D or 2D tensor, in case of 2D, y dim is the batch dimension
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] bias Biases tensor info. Only shared biases are supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+ * @param[out] dst Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] k Number of matrix A columns or matrix B rows
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
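+ *
+ * @note A typical call (tensor info names below are illustrative only) looks like:
+ * @code
+ * CpuGemmLowpOffsetContributionOutputStageKernel kernel;
+ * kernel.configure(&mm_result_info, &vector_sum_col_info, &vector_sum_row_info, &bias_info,
+ * &dst_info, k, a_offset, b_offset, output_stage_info);
+ * @endcode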
+ */
+ void configure(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpOffsetContributionOutputStageKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage);
+
+ /** Set the a offset
+ * Warning: if a_offset is non-zero then vector_sum_col must be provided to run_op.
+ * If in doubt, run configure() or validate() again.
+ *
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ */
+ void set_a_offset(int32_t a_offset);
+
+ /** Set the b offset
+ * Warning: if b_offset is non-zero then vector_sum_row must be provided to run_op.
+ * If in doubt, run configure() or validate() again.
+ *
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ */
+ void set_b_offset(int32_t b_offset);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ int32_t _k{0}; // Number of columns of A or rows of B, used in last offset term
+ bool _is_vector_sum_col_batched{true};
+ GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
new file mode 100644
index 0000000000..eefc294700
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_max_bound >
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_min_bound <
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+ output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+
+    // Check biases if they exist
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ if (dst->data_type() != output_stage->output_data_type &&
+ (output_stage->output_data_type == DataType::QASYMM8 ||
+ output_stage->output_data_type == DataType::QASYMM8_SIGNED))
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+
+inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
+{
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
+
+ // Multiply by result_mult_int
+ in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
+ in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
+ in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
+ in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
+}
+
+template <typename T>
+inline
+ typename std::enable_if<std::is_same<T, uint8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+ convert_to_8bit(const int16x8x2_t in_s16)
+{
+ return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1]));
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+convert_to_8bit(const int16x8x2_t in_s16)
+{
+ return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1]));
+}
+
+template <typename T>
+inline typename wrapper::traits::neon_vector<T, 16>::type
+finalize_quantization(int32x4x4_t &in_s32,
+ int32x4_t result_shift_s32,
+ typename wrapper::traits::neon_vector<T, 16>::type min,
+ typename wrapper::traits::neon_vector<T, 16>::type max)
+{
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
+
+ // Convert S16 to S8 or U8
+ typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16);
+
+ out = wrapper::vmax(out, min);
+ out = wrapper::vmin(out, max);
+
+ return out;
+}
+} // namespace
+
+template <typename T>
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
+{
+ using VectorType = typename wrapper::traits::neon_vector<T, 16>::type;
+
+ const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset);
+ const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->gemmlowp_shift);
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits<T>::lowest();
+ const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits<T>::max();
+
+ VectorType min = wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{});
+ VectorType max = wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{});
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator out(dst, win);
+
+ if (bias != nullptr)
+ {
+ Window win_biases;
+ win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator bias_i(bias, win_biases);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+ finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
+ int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+
+ // Quantize
+ in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) *
+ _output_stage->gemmlowp_multiplier) >>
+ _output_stage->gemmlowp_shift;
+
+ // Store the result
+ *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+ }
+ },
+ in, bias_i, out);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+ finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+
+ // Quantize
+ in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >>
+ _output_stage->gemmlowp_shift;
+
+ // Store the result
+ *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+ }
+ },
+ in, out);
+ }
+}
+
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
+{
+ ARM_COMPUTE_UNUSED(bias);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, output_stage);
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
+
+ _output_stage = output_stage;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ ICpuKernel::configure(win);
+
+ // Check if we need to clamp the result using min and max
+ _is_bounded_relu =
+ ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) &&
+ !(_output_stage->gemmlowp_min_bound ==
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) &&
+ _output_stage->gemmlowp_max_bound ==
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
+ if (_output_stage->output_data_type == DataType::QASYMM8)
+ {
+ _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>;
+ }
+ else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
+ {
+ _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+}
+
+Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
+ return Status{};
+}
+
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ (this->*_func)(src, bias, dst, window);
+}
+
+const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const
+{
+ return "CpuGemmLowpQuantizeDownInt32ScaleKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
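The left-over loops above spell out, one element at a time, what the 16-wide NEON path computes: add the offset (and the bias when present), multiply by gemmlowp_multiplier, arithmetic shift right by gemmlowp_shift, then clamp. The same computation as a standalone scalar function, for reference only (the name and signature are placeholders, not ACL API):

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the quantize-down-by-integer-scale step applied to each accumulator.
    int32_t quantize_down_scale(int32_t acc, int32_t bias, int32_t offset, int32_t multiplier,
                                int32_t shift, int32_t clamp_min, int32_t clamp_max)
    {
        int32_t v = (acc + bias + offset) * multiplier;      // offset and bias are added before scaling
        v >>= shift;                                         // arithmetic shift right by gemmlowp_shift
        return std::max(clamp_min, std::min(clamp_max, v));  // clamp, then the caller casts to 8 bit
    }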
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
new file mode 100644
index 0000000000..33e296b251
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ *    - to the [0..255] range and cast to QASYMM8.
+ *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ */
+class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ScaleKernel>
+{
+public:
+ CpuGemmLowpQuantizeDownInt32ScaleKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ScaleKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor info. Data type supported: S32
+ * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst          Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage GEMMLowp output stage metadata.
+ */
+ void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpQuantizeDownInt32ScaleKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+    /** Template function to run the CpuGemmLowpQuantizeDownInt32ScaleKernel
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Biases tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window())
+ */
+ template <typename T>
+ void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ScaleKernel functions
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Biases tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window);
+
+ QuantizeDownFunctionPtr _func{nullptr};
+ const GEMMLowpOutputStageInfo *_output_stage{nullptr};
+ bool _is_bounded_relu{false};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
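A hedged usage sketch of the kernel declared above, showing how an internal caller would typically drive it: validate the tensor infos, configure the kernel, then run it over an ITensorPack. Tensor creation and allocation are elided, the numeric output-stage values are examples only, and inside the library such kernels are normally dispatched through the CPU scheduler rather than by calling run_op() directly.

    #include "arm_compute/core/ITensor.h"
    #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"

    using namespace arm_compute;

    // src: S32 accumulators, bias: S32 bias, dst: QASYMM8 output (all assumed already allocated).
    void run_quantize_down(const ITensor *src, const ITensor *bias, ITensor *dst)
    {
        GEMMLowpOutputStageInfo stage{};
        stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN; // integer-scale variant
        stage.gemmlowp_offset     = 2;                                      // example values only
        stage.gemmlowp_multiplier = 3;
        stage.gemmlowp_shift      = 4;
        stage.gemmlowp_min_bound  = 0;
        stage.gemmlowp_max_bound  = 255;
        stage.output_data_type    = DataType::QASYMM8;

        cpu::kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel kernel;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(
            src->info(), bias->info(), dst->info(), &stage));
        kernel.configure(src->info(), bias->info(), dst->info(), &stage);

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, src);
        pack.add_const_tensor(TensorType::ACL_BIAS, bias);
        pack.add_tensor(TensorType::ACL_DST, dst);
        kernel.run_op(pack, kernel.window(), ThreadInfo{}); // single-threaded invocation
    }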
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000000..a5c09c9977
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NESymm.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(min > max);
+
+    // Check biases if they exist
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
+ }
+
+ return Status{};
+}
+} // namespace
+
+template <bool is_bounded_relu>
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
+{
+ const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min));
+ const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max));
+
+ ARM_COMPUTE_UNUSED(min_s16);
+ ARM_COMPUTE_UNUSED(max_s16);
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win_collapsed);
+ Iterator out(dst, win_collapsed);
+ if (bias != nullptr)
+ {
+ Window win_biases;
+ win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator bias_i(bias, win_biases);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+                // Compute 8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
+
+ const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+ _result_shift, min_s16, max_s16));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+ in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+ static_cast<int16_t>(_max));
+ }
+ },
+ in, out, bias_i);
+ }
+ else
+ {
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+                // Compute 8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
+
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+ _result_shift, min_s16, max_s16));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+ ARM_COMPUTE_UNUSED(in_value);
+ // Finalize and store the result
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+ in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+ static_cast<int16_t>(_max));
+ }
+ },
+ in, out);
+ }
+}
+
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int min,
+ int max)
+{
+ // Perform validate step
+    ARM_COMPUTE_UNUSED(bias);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max));
+
+ _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+ _result_shift = result_shift;
+ _min = min;
+ _max = max;
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QSYMM16));
+ // Configure kernel window
+ Window win_config = calculate_max_window(*src, Steps());
+ ICpuKernel::configure(win_config);
+
+ // Check if we need to clamp the result using min and max
+ const bool is_bounded_relu = !(min <= -32768 && max >= 32767);
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>;
+}
+
+Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+    const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
+ return Status{};
+}
+
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (this->*_func)(src, bias, dst, window);
+}
+
+const char *CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::name() const
+{
+ return "CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
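In the vector path above the arithmetic is delegated to finalize_quantization_int16() from NESymm.h. As a rough scalar model (a simplified sketch, not the library helper, ignoring the exact rounding and saturation corner cases of the NEON instructions), the scale-by-fixed-point step amounts to a rounding "doubling high" multiply by result_fixedpoint_multiplier, a rounding shift right by result_shift, and a clamp into the QSYMM16 range plus the optional ReLU bounds:

    #include <algorithm>
    #include <cstdint>

    // Simplified scalar model of the QSYMM16 requantization step (illustration only).
    int16_t quantize_down_fixedpoint_s16(int32_t acc, int32_t multiplier, int32_t shift,
                                         int32_t clamp_min, int32_t clamp_max)
    {
        // Rounding doubling high multiply: upper 32 bits of 2 * acc * multiplier, rounded to nearest.
        const int64_t prod  = static_cast<int64_t>(acc) * multiplier;
        const int64_t nudge = (prod >= 0) ? (1ll << 30) : (1 - (1ll << 30));
        int32_t       v     = static_cast<int32_t>((prod + nudge) / (1ll << 31));

        // Rounding division by 2^shift.
        if (shift > 0)
        {
            v = (v + (1 << (shift - 1))) >> shift;
        }

        // Optional bounded-ReLU clamp, then saturate to the int16 range.
        v = std::max(clamp_min, std::min(clamp_max, v));
        v = std::max<int32_t>(-32768, std::min<int32_t>(32767, v));
        return static_cast<int16_t>(v);
    }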
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..925788b680
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
+ *
+ */
+class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
+{
+public:
+ CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor info. Data type supported: S32
+ * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst                          Output tensor info. Data type supported: QSYMM16
+     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
+     * @param[in]  result_shift                 Integer value used for the rounding division by a power of two applied to the result after the fixed point multiplication
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
+ */
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int min = 0,
+ int max = 0);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Bias tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <bool is_bounded_relu>
+ void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Bias tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _min{0};
+ int _max{0};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000000..0e58097073
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(min > max);
+
+    // Check biases if they exist
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
+ }
+
+ return Status{};
+}
+} // namespace
+
+template <bool is_bounded_relu>
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
+{
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
+ const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(_min));
+ const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(_max));
+
+ ARM_COMPUTE_UNUSED(min_s8, max_s8);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win_collapsed);
+ Iterator out(dst, win_collapsed);
+ if (bias != nullptr)
+ {
+ Window win_biases;
+ win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator bias_i(bias, win_biases);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+ in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out, bias_i);
+ }
+ else
+ {
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Finalize and store the result
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+ in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out);
+ }
+}
+
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min,
+ int max)
+{
+ ARM_COMPUTE_UNUSED(bias);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max));
+
+ _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+ _result_shift = result_shift;
+ _result_offset_after_shift = result_offset_after_shift;
+ _min = min;
+ _max = max;
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8_SIGNED));
+
+ // Configure kernel window
+ Window win_config = calculate_max_window(*src, Steps());
+ ICpuKernel::configure(win_config);
+
+ // Check if we need to clamp the result using min and max
+ const bool is_bounded_relu = !(min <= -128 && max >= 127);
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>;
+}
+
+Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
+ return Status{};
+}
+
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (this->*_func)(src, bias, dst, window);
+}
+
+const char *CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::name() const
+{
+ return "CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..6a67ba4f19
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ */
+class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
+{
+public:
+ CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor info. Data type supported: S32
+ * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst                          Output tensor info. Data type supported: QASYMM8_SIGNED
+     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
+     * @param[in]  result_shift                 Integer value used for the rounding division by a power of two applied to the result after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min = 0,
+ int max = 0);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Bias tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <bool is_bounded_relu>
+ void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Bias tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
+ int _min{0};
+ int _max{0};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H */
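The (result_fixedpoint_multiplier, result_shift) pair documented above is normally derived from the floating-point requantization factor (roughly input_scale * weights_scale / output_scale); ACL ships helpers for this in arm_compute/core/utils/quantization/AsymmHelpers.h, and result_offset_after_shift then re-centres the value on the output zero point before the final clamp. The standalone sketch below shows the standard frexp-based derivation for a factor smaller than one; it is illustrative only and is not the library implementation.

    #include <cmath>
    #include <cstdint>

    // Derive (multiplier, right_shift) such that real_multiplier ~= multiplier * 2^-31 * 2^-right_shift,
    // with multiplier in [2^30, 2^31). For real_multiplier < 1 the resulting right_shift is >= 0.
    void quantize_multiplier(double real_multiplier, int32_t &multiplier, int32_t &right_shift)
    {
        int exponent = 0;
        const double significand = std::frexp(real_multiplier, &exponent); // real = significand * 2^exponent

        int64_t q = static_cast<int64_t>(std::llround(significand * (1ll << 31)));
        if (q == (1ll << 31)) // rounding may push the significand up to exactly 2^31
        {
            q /= 2;
            ++exponent;
        }
        multiplier  = static_cast<int32_t>(q);
        right_shift = -exponent;
    }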
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000000..e3dd2240ca
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(min > max);
+
+    // Check biases if they exist
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
+ }
+
+ return Status{};
+}
+} // namespace
+
+template <bool is_bounded_relu>
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
+{
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
+ const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));
+ const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max));
+
+ ARM_COMPUTE_UNUSED(min_u8);
+ ARM_COMPUTE_UNUSED(max_u8);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win_collapsed);
+ Iterator out(dst, win_collapsed);
+ if (bias != nullptr)
+ {
+ Window win_biases;
+ win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator bias_i(bias, win_biases);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ vst1q_u8(out.ptr() + x,
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+ _result_offset_after_shift, static_cast<uint8_t>(_min),
+ static_cast<uint8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out, bias_i);
+ }
+ else
+ {
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ vst1q_u8(out.ptr() + x,
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Finalize and store the result
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+ _result_offset_after_shift, static_cast<uint8_t>(_min),
+ static_cast<uint8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out);
+ }
+}
+
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min,
+ int max)
+{
+ ARM_COMPUTE_UNUSED(bias);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max));
+
+ _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+ _result_shift = result_shift;
+ _result_offset_after_shift = result_offset_after_shift;
+ _min = min;
+ _max = max;
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8));
+
+ // Configure kernel window
+ auto win_config = calculate_max_window(*src, Steps());
+ ICpuKernel::configure(win_config);
+
+ // Check if we need to clamp the result using min and max
+ const bool is_bounded_relu = !(min <= 0 && max >= 255);
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>;
+}
+
+Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
+ return Status{};
+}
+
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (this->*_func)(src, bias, dst, window);
+}
+
+const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() const
+{
+ return "CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..45bd742a70
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ */
+class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
+{
+public:
+ CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor info. Data type supported: S32
+     * @param[in]  bias                         Biases tensor info. Only shared biases are supported, and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst                          Output tensor info. Data type supported: QASYMM8
+     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
+     * @param[in]  result_shift                 Integer value used for the round-to-nearest division by a power-of-two applied after the fixed point multiplication
+     * @param[in]  result_offset_after_shift    Offset to be applied to the result before converting it back to QASYMM8
+     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min = 0,
+ int max = 0);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Template function to run the CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <bool is_bounded_relu>
+ void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
+ int _min{0};
+ int _max{0};
+};
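+
+// Illustrative sketch (hypothetical helper, kept under #if 0 so it is illustration only): a plain
+// scalar rendering of the documented quantize-down steps above, using only <cstdint> and
+// <algorithm>. The production kernel performs the same computation on whole NEON vectors with the
+// exact gemmlowp rounding rules; rounding of negative intermediates is simplified here.
+#if 0
+inline uint8_t quantize_down_scalar(
+    int32_t acc, int32_t bias, int32_t multiplier, int shift, int32_t offset, int32_t min, int32_t max)
+{
+    acc += bias;                                                   // optional bias addition
+    // Fixed point multiplication: rounded high 32 bits of the doubled product (acc * multiplier).
+    const int64_t prod = static_cast<int64_t>(acc) * multiplier;
+    int32_t       res  = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);
+    // Round-to-nearest division by 2^result_shift.
+    if (shift > 0)
+    {
+        res = (res + (1 << (shift - 1))) >> shift;
+    }
+    res += offset;                                                 // result_offset_after_shift
+    res = std::max<int32_t>(min, std::min<int32_t>(max, res));     // optional bounded ReLU
+    return static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, res))); // saturate to QASYMM8
+}
+#endif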
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
new file mode 100644
index 0000000000..fb1b70b91f
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/cpu/kernels/gemm_matrix_add/list.h"
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels = {
+ {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)},
+ {"neon_fp16_gemm_matrix_add",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)},
+
+};
+} // namespace
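+
+// Illustrative sketch (hypothetical helper, kept under #if 0 so it is illustration only):
+// get_implementation() used in configure() below is expected to perform a first-match lookup
+// over the table above, roughly along these lines (standard C++ only):
+#if 0
+const CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel *
+find_ukernel(const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &kernels,
+             const DataTypeISASelectorData                                       &data)
+{
+    for (const auto &k : kernels)
+    {
+        if (k.is_selected(data))
+        {
+            return &k; // first matching predicate wins, so order entries from most to least specialised
+        }
+    }
+    return nullptr;
+}
+#endif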
+
+void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta)
+{
+ ARM_COMPUTE_UNUSED(dst);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta));
+
+ _beta = beta;
+ const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+ _func = uk->ukernel;
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+ ICPPKernel::configure(win);
+}
+
+Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_UNUSED(beta);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+ return Status{};
+}
+
+void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ if (_beta != 0.0f)
+ {
+ (*_func)(src, dst, window, _beta);
+ }
+}
+
+const char *CpuGemmMatrixAdditionKernel::name() const
+{
+ return "CpuGemmMatrixAdditionKernel";
+}
+
+const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &
+CpuGemmMatrixAdditionKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
new file mode 100644
index 0000000000..5e12f1dcbd
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to perform the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
+ *
+ * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used to finalize the GEMM result, we have:
+ * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel
+ * - MTX_1 = C
+ */
+class CpuGemmMatrixAdditionKernel : public ICpuKernel<CpuGemmMatrixAdditionKernel>
+{
+private:
+ using GemmMatrixAddKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, float)>::type;
+
+public:
+ struct GemmMatrixAddKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ GemmMatrixAddKernelPtr ukernel;
+ };
+ CpuGemmMatrixAdditionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixAdditionKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note The input and output tensor must have the same dimensions
+ *
+ * @param[in] src Input tensor info (Matrix C). Data types supported: F16/F32
+ * @param[in, out] dst Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref CpuGemmMatrixMultiplyKernel. Data type supported: the same as @p src.
+ * @param[in] beta Weight of matrix C
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixAdditionKernel.
+ *
+ * @note The input and output tensor must have the same dimensions
+ *
+ * Similar to @ref CpuGemmMatrixAdditionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ static const std::vector<GemmMatrixAddKernel> &get_available_kernels();
+
+private:
+ /** Common signature for all the matrix addition functions
+ *
+ * @param[in] src An input tensor. Data types supported: F16/F32
+ * @param[out] dst The output tensor. Data type supported: same as @p src
+ * @param[in] window Region on which to execute the kernel.
+ * @param[in] beta Weight of matrix C
+ */
+ /** Matrix addition function to use for the particular tensor types passed to configure() */
+ GemmMatrixAddKernelPtr _func{nullptr};
+ float _beta{0.f};
+};
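+
+// Illustrative sketch (hypothetical helper, kept under #if 0 so it is illustration only): the
+// element-wise operation the selected micro-kernel performs, as plain scalar C++. dst already
+// holds MTX_0 (the GEMM result) and is updated in place; src is matrix C.
+#if 0
+inline void matrix_addition_reference(const float *src, float *dst, size_t num_elements, float beta)
+{
+    for (size_t i = 0; i < num_elements; ++i)
+    {
+        dst[i] += beta * src[i]; // MTX_OUT = MTX_0 + beta * MTX_1
+    }
+}
+#endif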
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..beccd94844
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/gemm_matrix_mul/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels = {
+ {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)},
+ {"neon_fp16_gemm_matrix_mul",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)},
+};
+
+inline Status validate_arguments(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
+
+ if (!is_interleaved)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1));
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+ }
+ }
+ else
+ {
+ const int m = reshape_info.m();
+ const int n = reshape_info.n();
+ const int k = reshape_info.k();
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+ /* Interleave */
+ TensorShape tensor_shape0{lhs->tensor_shape()};
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0);
+
+ if (n != 0) /* Transpose */
+ {
+ TensorShape tensor_shape1{rhs->tensor_shape()};
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(
+ tensor_info1, mult_transpose1xW_width));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1);
+ }
+
+ if (dst->total_size() != 0)
+ {
+ if (n != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(1) != static_cast<size_t>(m));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+ }
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+    // dst tensor auto initialization if not yet initialized
+ TensorShape tensor_shape{lhs->tensor_shape()};
+ tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0));
+ tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1));
+
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(tensor_shape));
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info));
+
+ _alpha = alpha;
+
+ // Configure kernel window
+ Window win{};
+
+    // Check if the dst tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ const bool is_dst_vector = (dst->dimension(1) == 1);
+ if (is_dst_vector)
+ {
+ const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32;
+
+ win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
+ }
+ else
+ {
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ win =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ }
+
+ const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(
+ DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+ _func = uk->ukernel;
+
+ ICPPKernel::configure(win);
+}
+
+Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info));
+
+ return Status{};
+}
+
+void CpuGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ const ITensor *lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensor *rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const bool is_dst_vector = (dst->info()->dimension(1) == 1);
+ (*_func)(lhs, rhs, dst, window, info, _alpha, is_dst_vector);
+}
+
+const char *CpuGemmMatrixMultiplyKernel::name() const
+{
+ return "CpuGemmMatrixMultiplyKernel";
+}
+
+const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &
+CpuGemmMatrixMultiplyKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..765fcb8275
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p lhs and @p rhs are both matrices and reshaped respectively with @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped
+ *
+ */
+class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel>
+{
+private:
+ using GemmMatrixMulKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type;
+
+public:
+ struct GemmMatrixMulKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ GemmMatrixMulKernelPtr ukernel;
+ };
+
+ CpuGemmMatrixMultiplyKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixMultiplyKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note If the output tensor is a matrix, the input matrices @p lhs and @p rhs should be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel
+ * These two kernels change the layout of the original matrices to be more cache-friendly.
+ *
+     * @param[in]  lhs            Left-hand side tensor info containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+     * @param[in]  rhs            Right-hand side tensor info containing the transposed Matrix B if the first input tensor A is not a vector.
+     *                            If the output tensor is a vector, rhs must contain the matrix B not reshaped. Data type supported: same as @p lhs
+ * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p lhs.
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel
+     * @param[in]  reshape_info   (Optional) GEMM reshape info. If @p is_interleaved is true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped
+ */
+ void configure(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel
+ *
+ * Similar to @ref CpuGemmMatrixMultiplyKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ static const std::vector<GemmMatrixMulKernel> &get_available_kernels();
+
+private:
+ /** Common signature for all the matrix multiply functions
+ *
+     * @param[in]  lhs    Left-hand side input tensor. Data types supported: F16/F32
+     * @param[in]  rhs    Right-hand side input tensor. Data types supported: same as @p lhs
+ * @param[out] dst The output tensor. Data type supported: same as @p rhs
+ * @param[in] window Region on which to execute the kernel.
+ * @param[in] info Thread info metadata.
+ * @param[in] alpha Weight of the matrix product.
+ */
+
+ /** Matrix multiply function to use for the particular tensor types passed to configure() */
+ GemmMatrixMulKernelPtr _func{nullptr};
+ float _alpha{1.f};
+};
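+
+// Illustrative sketch (hypothetical helper, kept under #if 0 so it is illustration only): the
+// non-reshaped vector * matrix path described above, written as plain scalar C++. The real
+// micro-kernels are blocked NEON code, and the matrix path additionally expects
+// interleaved/transposed inputs.
+#if 0
+inline void vector_matrix_multiply_reference(
+    const float *lhs, const float *rhs, float *dst, size_t k, size_t n, float alpha)
+{
+    // lhs: 1 x k vector, rhs: k x n matrix (row-major), dst: 1 x n vector
+    for (size_t x = 0; x < n; ++x)
+    {
+        float acc = 0.f;
+        for (size_t i = 0; i < k; ++i)
+        {
+            acc += lhs[i] * rhs[i * n + x];
+        }
+        dst[x] = alpha * acc; // every output element is scaled by alpha after the multiplication
+    }
+}
+#endif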
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
new file mode 100644
index 0000000000..c47746bc4b
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+void CpuGemmTranspose1xWKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*src)));
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmTranspose1xWKernel::validate(src, dst));
+
+ const size_t vector_size = 16 / src->element_size();
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps(vector_size));
+ ICPPKernel::configure(win);
+}
+
+Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ compute_transpose1xW_with_element_size_shape(*src));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+
+void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+ /*
+     * The following is an example of how the 1xW transposition works when the src data type is F32
+ *
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * The dst matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ */
+
+    // Set window for dst tensor. Set the X and Y dimensions to 0 to allow a multi-threaded implementation and future batched matrix multiplications
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ Iterator in(src, window);
+ Iterator out(dst, win_out);
+
+ const size_t in_width = src->info()->dimension(0);
+ const size_t element_size = src->info()->element_size();
+ const size_t out_stride = dst->info()->strides_in_bytes()[1];
+ const size_t vector_size = 16 / element_size;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const uint8_t *in_ptr = in.ptr();
+ uint8_t *const out_ptr =
+ out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
+
+ for (size_t k = 0; k < vector_size; ++k)
+ {
+                // If the src width is not a multiple of W, we fill the remaining elements with 0s
+ if ((id.x() + k) >= in_width)
+ {
+ std::memset(out_ptr + k * element_size, 0, element_size);
+ }
+ else
+ {
+ std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
+ }
+ }
+ },
+ in, out);
+}
+
+const char *CpuGemmTranspose1xWKernel::name() const
+{
+ return "CpuGemmTranspose1xWKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
new file mode 100644
index 0000000000..4b834b2cc6
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
+ *
+ * The following is an example of how the 1xW transposition works when the input data type is F32
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * The following is an example of how the 1xW transposition works when the input data type is F16
+ *
+ * @f[
+ * \left( \begin{array}{cccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
+ * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
+ * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
+ * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ *
+ */
+class CpuGemmTranspose1xWKernel : public ICpuKernel<CpuGemmTranspose1xWKernel>
+{
+public:
+ CpuGemmTranspose1xWKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmTranspose1xWKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Input tensor info. Data types supported: All
+ * @param[out] dst Output tensor info. Data type supported: same as @p src.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmTranspose1xWKernel
+ *
+ * Similar to @ref CpuGemmTranspose1xWKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+};
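+
+// Illustrative sketch (hypothetical helper, kept under #if 0 so it is illustration only): the
+// reshaped output dimensions documented above, where W = 16 / element size (4 for F32, 8 for F16,
+// 16 for 8-bit types).
+#if 0
+inline std::pair<size_t, size_t> transpose1xw_output_shape(size_t width, size_t height, size_t element_size)
+{
+    const size_t w = 16 / element_size;       // elements packed into one 16-byte chunk
+    return {height * w, (width + w - 1) / w}; // [ height * W, ceil(width / W) ]
+}
+#endif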
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H */
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp
new file mode 100644
index 0000000000..39ba764c78
--- /dev/null
+++ b/src/cpu/kernels/CpuIm2ColKernel.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuIm2ColKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/directconv2d/impl.h"
+#include "src/cpu/kernels/directconv2d/list.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace cpu
+{
+namespace kernels
+{
+void run_im2col_fp32_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<float, true, false>(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_fp32_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<float, false, false>(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+void run_im2col_bf16_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<bfloat16, true, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_bf16_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<bfloat16, false, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+
+void run_im2col_int8_nopad_nhwc(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<int8_t, false, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_uint8_nopad_nhwc(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<uint8_t, false, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_qasymm8_pad_nhwc(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void internal_run_im2col_fp16_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+/*
+ Note that when building with the option data_type_support=fp32 the fp16.cpp files won't be compiled and the linker
+ would fail with an undefined reference to arm_compute::cpu::kernels::run_im2col_fp16_pad.
+ To avoid this problem we only call the actual fp16 kernel if ENABLE_FP16_KERNELS is defined.
+*/
+#if defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col_fp16_pad(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(ENABLE_FP16_KERNELS)
+}
+
+void internal_run_im2col_fp16_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+#if defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col_fp16_nopad(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(ENABLE_FP16_KERNELS)
+}
+
+void internal_run_im2col_fp16_nchw_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+#if defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col_fp16_nchw_pad(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(ENABLE_FP16_KERNELS)
+}
+
+void internal_run_im2col_fp16_nchw_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+#if defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col_fp16_nchw_nopad(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(ENABLE_FP16_KERNELS)
+}
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon");
+
+    // Since there's no implicit padding added, check that the total input spatial dimensions (including the conv paddings) are big enough for the kernel dimensions
+ const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+ const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
+ ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
+
+ if (output->total_size() > 0)
+ {
+ TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(
+ input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuIm2ColKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_UNUSED(num_groups);
+
+ _data_layout = src->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ _conv_info = conv_info;
+ _kernel_width = kernel_dims.width;
+ _kernel_height = kernel_dims.height;
+ _input_pad_right = input_pad_right;
+ _dilation = dilation;
+ _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width,
+ _kernel_height, _conv_info, _dilation);
+ _has_bias = has_bias;
+
+ if (_data_layout == DataLayout::NCHW)
+ {
+ switch (src->data_type())
+ {
+ case DataType::F32:
+ _func = (!conv_info.has_padding()) ? &run_im2col_fp32_nchw_nopad : &run_im2col_fp32_nchw_pad;
+ break;
+ case DataType::F16:
+ _func = (!conv_info.has_padding()) ? &internal_run_im2col_fp16_nchw_nopad
+ : &internal_run_im2col_fp16_nchw_pad;
+ break;
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+ case DataType::BFLOAT16:
+ _func = (!conv_info.has_padding()) ? &run_im2col_bf16_nchw_nopad : &run_im2col_bf16_nchw_pad;
+ break;
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QASYMM8:
+ _func = (!conv_info.has_padding()) ? &run_im2col_qasymm8_nchw_nopad : &run_im2col_qasymm8_nchw_pad;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch (src->data_type())
+ {
+ case DataType::F32:
+ _func = (!conv_info.has_padding()) ? &run_im2col_fp32_nopad : &run_im2col_fp32_pad;
+ break;
+ case DataType::F16:
+ _func = (!conv_info.has_padding()) ? &internal_run_im2col_fp16_nopad : &internal_run_im2col_fp16_pad;
+ break;
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+ case DataType::BFLOAT16:
+ _func = (!conv_info.has_padding()) ? &run_im2col_bf16_nopad : &run_im2col_bf16_pad;
+ break;
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+ case DataType::QASYMM8:
+ _func = (!conv_info.has_padding()) ? &run_im2col_uint8_nopad_nhwc : &run_im2col_qasymm8_pad_nhwc;
+ break;
+ case DataType::QASYMM8_SIGNED:
+ _func = (!conv_info.has_padding()) ? &run_im2col_int8_nopad_nhwc : &run_im2col_qasymm8_pad_nhwc;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation,
+ false, num_groups, _input_pad_right)));
+
+ std::pair<unsigned int, unsigned int> convolved_dims =
+ scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height,
+ conv_info, dilation);
+
+ Window win = calculate_max_window(*src, Steps());
+ win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
+ win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
+ win.set(channel_idx, Window::Dimension(0, 1, 1));
+ // Configure kernel window
+ ICpuKernel::configure(win);
+}
+
+Status CpuIm2ColKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ return Status{};
+}
+
+void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _func(src, dst, window, _data_layout, _conv_info, _convolved_dims, Size2D(_kernel_width, _kernel_height), _dilation,
+ _input_pad_right, _has_bias);
+}
+
+const char *CpuIm2ColKernel::name() const
+{
+ return "CpuIm2ColKernel";
+}
+
+size_t CpuIm2ColKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h
new file mode 100644
index 0000000000..ae7162cccf
--- /dev/null
+++ b/src/cpu/kernels/CpuIm2ColKernel.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H
+
+#include "arm_compute/core/Size2D.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns: each convolution block is stripped out into a single column,
+ * so that the convolution can be computed as a plain matrix multiplication.
+ *
+ * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
+{
+public:
+ /** Default constructor */
+ CpuIm2ColKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel);
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
+ * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
+ * @param[out] dst The output tensor info. Data types supported: Same as @p input
+ * @param[in] kernel_dims The kernel dimensions (width and height).
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias        If biases are provided, each im2col column is expanded with an extra element equal to 1.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ * @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
+ */
+ void configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuIm2ColKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+     * @return[out] small_network_mws Minimum workload size for the requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+private:
+ /** Common signature for all the specialised im2col functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using Im2ColFunctionPtr = void (*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias);
+
+ Im2ColFunctionPtr _func{nullptr};
+ std::pair<unsigned int, unsigned int> _convolved_dims{};
+ PadStrideInfo _conv_info{};
+ unsigned int _kernel_width{0};
+ unsigned int _kernel_height{0};
+ unsigned int _input_pad_right{0};
+ bool _has_bias{false};
+ Size2D _dilation{1U, 1U};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H
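The 4x4 -> 4x9 rearrangement described in the CpuIm2ColKernel documentation above can be reproduced with a few lines of scalar code. The sketch below is illustrative only and not part of the patch: it assumes a single-channel, row-major image with stride 1, no padding and no dilation, and the helper name im2col_single_channel is hypothetical.

#include <cstddef>
#include <vector>

// Minimal scalar im2col: each output row is one k x k patch laid out row by row,
// matching the 4x4 -> 4x9 example in the kernel documentation (k = 3, stride 1).
std::vector<float> im2col_single_channel(const std::vector<float> &img, size_t w, size_t h, size_t k)
{
    const size_t out_w = w - k + 1; // patch positions along x
    const size_t out_h = h - k + 1; // patch positions along y
    std::vector<float> out;
    out.reserve(out_w * out_h * k * k);
    for (size_t y = 0; y < out_h; ++y)
    {
        for (size_t x = 0; x < out_w; ++x)
        {
            for (size_t ky = 0; ky < k; ++ky)
            {
                for (size_t kx = 0; kx < k; ++kx)
                {
                    out.push_back(img[(y + ky) * w + (x + kx)]);
                }
            }
            // With has_bias = true the kernel additionally appends a constant 1 to each row.
        }
    }
    return out; // (out_h * out_w) rows of k * k elements, ready for a plain matrix multiplication
}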
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
new file mode 100644
index 0000000000..7c1e4772a6
--- /dev/null
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
+#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+// Selector data types
+struct DataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+};
+
+struct DataTypeDataLayoutISASelectorData
+{
+ DataType dt;
+ DataLayout dl;
+ const cpuinfo::CpuIsaInfo &isa;
+};
+
+struct CastDataTypeISASelectorData
+{
+ DataType src_dt;
+ DataType dst_dt;
+ const cpuinfo::CpuIsaInfo &isa;
+};
+
+struct PoolDataTypeISASelectorData
+{
+ DataType dt;
+ DataLayout dl;
+ int pool_stride_x;
+ Size2D pool_size;
+ cpuinfo::CpuIsaInfo isa;
+};
+
+struct ElementwiseDataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+ int op;
+};
+struct DepthwiseConv2dNativeDataTypeISASelectorData
+{
+ DataType weights_dt;
+ DataType source_dt;
+ const cpuinfo::CpuIsaInfo &isa;
+};
+
+struct ActivationDataTypeISASelectorData
+{
+ DataType dt;
+ const CPUModel &cpumodel;
+ const cpuinfo::CpuIsaInfo &isa;
+ const ActivationFunction f;
+};
+
+struct CpuAddKernelDataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+ bool can_use_fixedpoint;
+};
+
+struct ScaleKernelDataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+ InterpolationPolicy interpolation_policy;
+};
+
+struct SoftmaxKernelDataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+ bool is_log;
+ int axis;
+ unsigned long sme2_vector_length;
+};
+
+// Selector pointer types
+using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
+using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
+using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
+using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
+using DepthwiseConv2dNativeDataTypeISASelectorPtr =
+ std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
+using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
+using ActivationDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
+using CpuAddKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
+using ScaleKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
+using SoftmaxKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type;
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
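The selector structs and pointer aliases above are consumed by per-kernel tables that pair a name, a predicate and a micro-kernel pointer; the first entry whose predicate accepts the runtime data type / ISA combination is used. The stand-alone sketch below mirrors that pattern with simplified stand-in types; MicroKernel, pick_kernel and the stub functions are hypothetical, not library symbols.

#include <cstdio>
#include <vector>

// Stand-ins for the library types, just to illustrate the selection pattern.
enum class DataType { F32, F16, QASYMM8 };
struct CpuIsaInfo { bool fp16 = false; };
struct DataTypeISASelectorData { DataType dt; CpuIsaInfo isa; };

using SelectorPtr = bool (*)(const DataTypeISASelectorData &);
using UKernelPtr  = void (*)();

struct MicroKernel
{
    const char *name;
    SelectorPtr is_selected;
    UKernelPtr  ukernel;
};

void neon_fp32_stub() { std::puts("fp32 path"); }
void neon_fp16_stub() { std::puts("fp16 path"); }

// Returns the first entry whose predicate accepts the (data type, ISA) pair.
const MicroKernel *pick_kernel(const std::vector<MicroKernel> &table, const DataTypeISASelectorData &data)
{
    for (const auto &k : table)
    {
        if (k.is_selected(data))
        {
            return &k;
        }
    }
    return nullptr;
}

int main()
{
    const std::vector<MicroKernel> table = {
        {"neon_fp16", [](const DataTypeISASelectorData &d) { return d.dt == DataType::F16 && d.isa.fp16; }, neon_fp16_stub},
        {"neon_fp32", [](const DataTypeISASelectorData &d) { return d.dt == DataType::F32; }, neon_fp32_stub},
    };
    const auto *uk = pick_kernel(table, {DataType::F32, {}});
    if (uk != nullptr)
    {
        uk->ukernel(); // prints "fp32 path"
    }
}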
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
new file mode 100644
index 0000000000..bcaa76b99b
--- /dev/null
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/maxunpool/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels = {
+ {"neon_fp32_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(neon_fp32_maxunpooling)},
+ {"neon_fp16_maxunpooling",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_maxunpooling)},
+ {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(neon_qu8_maxunpooling)},
+ {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_maxunpooling)},
+};
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices);
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+ const int pool_size_x = pool_info.pool_size.width;
+ const int pool_size_y = pool_info.pool_size.height;
+ const Size2D pool_size(pool_size_x, pool_size_y);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info));
+ ARM_COMPUTE_UNUSED(indices);
+
+ const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+ _run_method = uk->ukernel;
+
+ const TensorShape output_shape = compute_unpool_shape(*src, pool_info);
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape));
+
+ auto window = calculate_max_window(*src, Steps());
+ ICpuKernel::configure(window);
+}
+
+Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info));
+ return Status{};
+}
+
+void CpuMaxUnpoolingLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto indices = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _run_method(src, indices, dst, window);
+}
+
+const char *CpuMaxUnpoolingLayerKernel::name() const
+{
+ return "CpuMaxUnpoolingLayerKernel";
+}
+
+const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> &CpuMaxUnpoolingLayerKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
new file mode 100644
index 0000000000..5a641a2bea
--- /dev/null
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPUMAXUNPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_CPUMAXUNPOOLINGLAYERKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the pooling layer kernel */
+class CpuMaxUnpoolingLayerKernel : public ICpuKernel<CpuMaxUnpoolingLayerKernel>
+{
+private:
+ using MaxUnpoolingUKernelPtr = std::add_pointer<void(
+ const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type;
+
+public:
+ /** Default constructor */
+ CpuMaxUnpoolingLayerKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMaxUnpoolingLayerKernel);
+
+ /** Configure kernel for a given list of arguments
+ *
+ * @note Dst shape must be equal to the shape of the original src to pool.
+ *
+ * @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices Tensor containing the offset to store the src elements in the dst tensor.
+ * @ref CpuMaxUnpooling with indices should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The tensor shape of this tensor has to be equal to the src tensor shape. Data type supported: U32.
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel
+ *
+ * @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices Tensor info of the indices of the maximal values. Data type supported: U32.
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+ struct MaxUnpoolingKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ MaxUnpoolingUKernelPtr ukernel;
+ };
+
+ static const std::vector<MaxUnpoolingKernel> &get_available_kernels();
+
+ const char *name() const override;
+
+private:
+ MaxUnpoolingUKernelPtr _run_method{nullptr};
+};
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPUMAXUNPOOLINGLAYERKERNEL_H */
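A rough usage sketch for the kernel declared above, staying at the ITensorInfo / ITensorPack level used throughout this patch: validate and configure on metadata, then hand the real tensors to run_op. This is a minimal single-threaded illustration under stated assumptions, not the library's documented operator flow; the include list is abbreviated and the helper name run_max_unpooling_example is hypothetical.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"

using namespace arm_compute;

void run_max_unpooling_example(ITensor *src, ITensor *indices, ITensor *dst, const PoolingLayerInfo &pool_info)
{
    cpu::kernels::CpuMaxUnpoolingLayerKernel kernel;

    // Both calls operate on tensor metadata only.
    ARM_COMPUTE_ERROR_THROW_ON(
        cpu::kernels::CpuMaxUnpoolingLayerKernel::validate(src->info(), indices->info(), dst->info(), pool_info));
    kernel.configure(src->info(), indices->info(), dst->info(), pool_info);

    // The actual tensors travel through an ITensorPack, using the same slot ids
    // the kernel reads back in run_op() (ACL_SRC_0, ACL_SRC_1, ACL_DST).
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, indices);
    pack.add_tensor(TensorType::ACL_DST, dst);

    // Single-threaded execution over the kernel's full window.
    kernel.run_op(pack, kernel.window(), ThreadInfo{});
}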
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
new file mode 100644
index 0000000000..8001482154
--- /dev/null
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -0,0 +1,1831 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuMulKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/mul/generic/neon/list.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+#if defined(ENABLE_FP32_KERNELS)
+static constexpr size_t default_mws_N1_fp32_neon = 22447;
+static constexpr size_t default_mws_V1_fp32_neon = 38982;
+#endif /* ENABLE_FP32_KERNELS */
+static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
+} // namespace
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+const float scale255_constant = 1.f / 255.f;
+const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
+const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
+
+inline Status validate_arguments(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_UNUSED(overflow_policy);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::S32, DataType::F16, DataType::F32);
+ if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP,
+ "ConvertPolicy cannot be WRAP if datatype is quantized");
+ }
+
+ if (dst->total_size() > 0)
+ {
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ // clang-format off
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
+ !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
+ !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
+ !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
+ !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
+ , "Invalid data type combination");
+ // clang-format on
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32 &&
+ scale != 1.f,
+ "Unsupported scale for QSYMM16 inputs and S32 dst");
+ }
+
+ if (std::abs(scale - scale255_constant) < 0.00001f)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP &&
+ rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 &&
+ dst->data_type() == DataType::S32,
+ "Scale == 1/255 is not supported if input and dst are of data type S32");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
+
+ int exponent = 0;
+ const float normalized_mantissa = std::frexp(scale, &exponent);
+
+ // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+ // frexp returns 0.5 as the mantissa, so the exponent lies in the range -14 <= e <= 1
+ // (non-positive for every n >= 1, since scale = 1/2^n gives e = 1 - n)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
+ "Scale value not supported (Should be 1/(2^n) or 1/255)");
+ }
+
+ return Status{};
+}
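+/* Worked example of the scale check above (illustrative note, not part of the original patch):
+ * - scale = 1/8:   std::frexp(0.125f, &e) returns a mantissa of 0.5 with e = -2, so validation
+ *                  passes and the multiplication can later be implemented as a right shift by 3.
+ * - scale = 1/255: handled by the dedicated branch, which only constrains the rounding policy.
+ * - scale = 0.3:   frexp returns a mantissa of 0.6 with e = -1; the mantissa is not 0.5, so the
+ *                  configuration is rejected.
+ */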
+
+/* Scales a given vector by 1/255.
+ *
+ * @note This does not work for all cases, e.g. for the float value 0.49999999999999994 and for very large floats.
+ *
+ * @param in Input vector to scale.
+ * @return Scaled dst rounded to nearest (round half up).
+ */
+inline int32x4_t scale255_S32_S32(int32x4_t in)
+{
+ // Scale
+ const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
+ // Round to nearest (round half up)
+ // Add +0.5 for all values
+ // Afterwards vcvt rounds toward zero
+ return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
+}
+
+inline uint16x8_t scale255_U16_U16(uint16x8_t in)
+{
+ const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
+ const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
+ return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
+vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
+{
+ return vquantize_signed(val, info);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
+vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
+{
+ return vquantize(val, info);
+}
+
+template <typename T>
+void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
+ const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
+ const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator dst(out, win);
+
+ using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
+
+ const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
+
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
+
+ // Quantize dst
+ const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+ wrapper::vstore(output_ptr + x, result);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ const T src1 = *(non_broadcast_input_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
+ const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
+ const float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst
+ const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
+ }
+ else
+ {
+ const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
+
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto input1_q = wrapper::vloadq(input1_ptr + x);
+ const auto input2_q = wrapper::vloadq(input2_ptr + x);
+
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
+
+ // Quantize dst
+ const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+ wrapper::vstore(output_ptr + x, result);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ const T src1 = *(input1_ptr + x);
+ const T src2 = *(input2_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
+ const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
+ const float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst
+ const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ input1, input2, dst);
+ }
+}
+
+bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ float scale)
+{
+ const auto iq0 = src0->quantization_info().uniform();
+ const auto iq1 = src1->quantization_info().uniform();
+ const auto oq = dst->quantization_info().uniform();
+
+ const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
+
+ if (multiplier < -8191.f || multiplier > 8191.f)
+ {
+ //The multiplier cannot be stored as a 14.18 signed fixed-point number
+ return false;
+ }
+
+ const auto offset_out = float(oq.offset);
+
+ const auto max_result = multiplier * (256) * (256) + offset_out;
+
+ if (max_result > 8191.f)
+ {
+ //It might not be possible to store the result as a 14.18 signed fixed-point number.
+ return false;
+ }
+
+ return true;
+}
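+/* Worked example for the 14.18 feasibility check above (illustrative note, not part of the
+ * original patch): with iq0.scale = iq1.scale = 0.02, oq.scale = 0.1 and scale = 1.0 the combined
+ * multiplier is (0.02 * 0.02) / 0.1 = 0.004, stored as 0.004 * 2^18 (about 1048) in 14.18 format.
+ * The worst-case accumulator is 0.004 * 256 * 256 + offset_out, i.e. roughly 262 + offset_out,
+ * comfortably below the 8191 limit, so the fixed-point path is usable. A multiplier of 0.2 would
+ * instead give 0.2 * 256 * 256 = 13107.2 > 8191 and the generic dequantize/requantize path is used.
+ */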
+
+template <typename ScalarType>
+void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
+{
+ const auto in0_info = src0->info();
+ const auto in1_info = src1->info();
+
+ const auto &in0_shape = in0_info->tensor_shape();
+ const auto &in1_shape = in1_info->tensor_shape();
+
+ // Create input windows.
+ Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
+ Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);
+
+ // Clear the x dimension on the execution window as we process the whole row each iteration.
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16;
+ const auto window_start_x = window.x().start();
+ const auto window_end_x = window.x().end();
+ const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x();
+
+ const auto iq0_info = in0_info->quantization_info().uniform();
+ const auto iq1_info = in1_info->quantization_info().uniform();
+ const auto oq_info = dst->info()->quantization_info().uniform();
+
+ const auto in0_offset = iq0_info.offset;
+ const auto in1_offset = iq1_info.offset;
+ const auto out_offset = oq_info.offset;
+ const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;
+
+ constexpr int32_t two_pwr18i = 262144;
+ constexpr float two_pwr18f = 262144.f;
+
+ const auto in0_offset_16p0 = static_cast<int16_t>(in0_offset);
+ const auto in1_offset_16p0 = static_cast<int16_t>(in1_offset);
+ const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
+ const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);
+
+ if (is_broadcast_across_x)
+ {
+ // Prefix: a = non-broadcast, b = broadcast.
+
+ const auto is_broadcast_input_1 = in1_win.x().step() == 0;
+ auto a_win = is_broadcast_input_1 ? in0_win : in1_win;
+ auto b_win = is_broadcast_input_1 ? in1_win : in0_win;
+ const auto a_tensor = is_broadcast_input_1 ? src0 : src1;
+ const auto b_tensor = is_broadcast_input_1 ? src1 : src0;
+
+ const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
+ const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
+#ifndef __aarch64__
+ const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
+ const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
+#endif //__aarch64__
+ const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());
+
+ // Clear the x dimension on the execution window as we process the whole row each iteration.
+ a_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator a_input_it(a_tensor, a_win);
+ Iterator b_input_it(b_tensor, b_win);
+ Iterator out_it(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+ const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ const auto b_val = *b_ptr;
+ const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0);
+ const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
+
+ const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
+ const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
+
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+
+ // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+ const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+ const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+
+ const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
+ const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
+ const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
+ const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
+
+ const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
+ const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
+ const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
+ const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
+
+ const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+ const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+ const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+ const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+
+ // These right shifts revert the multiplication by 2^18. The hard limit of a maximum shift of 8 requires multiple shift instructions to achieve this.
+ const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+ const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+ const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+ const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+
+ const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01);
+
+ const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11);
+
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1));
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ //Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
+#ifdef __aarch64__
+ out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
+ (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) +
+ out_offset_14p18)));
+#else //__aarch64__
+ out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+ multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
+#endif //__aarch64__
+ }
+ },
+ a_input_it, b_input_it, out_it);
+ }
+ else
+ {
+ const auto voffset0_16p0 = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
+ const auto voffset1_16p0 = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
+ const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
+ const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
+
+ // Clear the x dimension on the execution window as we process the whole row each iteration.
+ in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in0_it(src0, in0_win);
+ Iterator in1_it(src1, in1_win);
+ Iterator out_it(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+ const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+ const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+
+ // Widen the input elements to signed 16-bit regardless of the input signedness.
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+ const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+ const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+ const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+
+ const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
+ const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
+ const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
+ const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
+
+ const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
+ const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
+ const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
+ const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
+
+ const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
+ const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
+ const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
+ const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
+
+ const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+ const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+ const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+ const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+
+ // These right shifts revert the multiplication by 2^18. The hard limit of a maximum shift of 8 requires multiple shift instructions to achieve this.
+ const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+ const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+ const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+ const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+
+ const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01);
+
+ const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11);
+
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1));
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ //Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
+#ifdef __aarch64__
+ out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(
+ wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) *
+ (int32_t(in1_ptr[x]) - in1_offset_16p0)) +
+ out_offset_14p18)));
+#else //__aarch64__
+ out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+ multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) +
+ float(out_offset)));
+#endif //__aarch64__
+ }
+ },
+ in0_it, in1_it, out_it);
+ }
+}
+
+void mul_saturate_QSYMM16_QSYMM16_QSYMM16(
+ const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+{
+ const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
+ const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
+
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const qsymm16x8x2_t input1_q = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const qsymm16x8x2_t input2_q = {{
+ vld1q_s16(input2_ptr + x),
+ vld1q_s16(input2_ptr + x + 8),
+ }};
+
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
+
+ const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
+ float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
+ float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst, lrintf() has same rounding mode as vcombine_s16
+ int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
+ qsymm16_t tmp_qua =
+ static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ input1, input2, dst);
+}
+
+void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
+{
+ ARM_COMPUTE_UNUSED(scale);
+
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const qsymm16x8x2_t input1_q = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const qsymm16x8x2_t input2_q = {{
+ vld1q_s16(input2_ptr + x),
+ vld1q_s16(input2_ptr + x + 8),
+ }};
+
+ const int32x4x4_t in1_s32 = {{
+ vmovl_s16(vget_low_s16(input1_q.val[0])),
+ vmovl_s16(vget_high_s16(input1_q.val[0])),
+ vmovl_s16(vget_low_s16(input1_q.val[1])),
+ vmovl_s16(vget_high_s16(input1_q.val[1])),
+ }};
+ const int32x4x4_t in2_s32 = {{
+ vmovl_s16(vget_low_s16(input2_q.val[0])),
+ vmovl_s16(vget_high_s16(input2_q.val[0])),
+ vmovl_s16(vget_low_s16(input2_q.val[1])),
+ vmovl_s16(vget_high_s16(input2_q.val[1])),
+ }};
+
+ const int32x4x4_t result = {{
+ vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
+ vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
+ vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
+ vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
+ }};
+
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ vst1q_s32(output_ptr + x + 8, result.val[2]);
+ vst1q_s32(output_ptr + x + 12, result.val[3]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input1, input2, dst);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ const int window_step_x = 16 / sizeof(uint8_t);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
+ const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
+
+ uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
+ const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
+ uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
+ const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
+
+ tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
+
+ if (is_scale255)
+ {
+ tmp1_high = scale255_U16_U16(tmp1_high);
+ tmp1_low = scale255_U16_U16(tmp1_low);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if (is_sat)
+ {
+ tmp1_high = vqshlq_u16(tmp1_high, vn);
+ tmp1_low = vqshlq_u16(tmp1_low, vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_u16(tmp1_high, vn);
+ tmp1_low = vshlq_u16(tmp1_low, vn);
+ }
+ }
+ if (is_sat)
+ {
+ vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
+ }
+ else
+ {
+ vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
+ }
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
+
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ tmp = static_cast<uint16_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ tmp >>= n;
+ }
+ if (is_sat && tmp > 255)
+ {
+ tmp = 255;
+ }
+ *(output_ptr + x) = static_cast<uint8_t>(tmp);
+ }
+ },
+ input1, input2, dst);
+}
+
+template <bool is_scale255, bool is_sat>
+inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
+{
+ int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
+ const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
+ int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
+ const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));
+
+ tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
+
+ if (is_scale255)
+ {
+ tmp1_high = scale255_S32_S32(tmp1_high);
+ tmp1_low = scale255_S32_S32(tmp1_low);
+ }
+ else
+ {
+ // Right shift amount
+ const int32x4_t vn = vdupq_n_s32(-n);
+ // Left shift amount
+ const int32x4_t vnl = vdupq_n_s32(n);
+ // Calculate conversion bit
+ const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
+ const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
+ const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
+ const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
+ const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
+ const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
+ const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
+ const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
+ if (is_sat)
+ {
+ tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
+ }
+ }
+
+ if (is_sat)
+ {
+ return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
+ }
+ else
+ {
+ return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
+ }
+}
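+/* Worked example of the sign-based rounding correction above (illustrative note, not part of the
+ * original patch): for n = 2 and a product tmp = -7, a plain arithmetic shift gives -7 >> 2 = -2,
+ * i.e. rounding towards negative infinity. The sign bit is 1, so convert = (1 << 2) - 1 = 3 and
+ * (-7 + 3) >> 2 = -1, which matches truncation towards zero (-7 / 4 = -1). For non-negative
+ * products the sign bit is 0, the correction vanishes and the shift already truncates.
+ */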
+
+template <bool is_scale255, bool is_sat>
+inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
+{
+ const int16x8x2_t result = {{// First 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
+ // Second 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}};
+
+ return result;
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t ta1 = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const int16x8x2_t ta2 = {{
+ vld1q_s16(input2_ptr + x),
+ vld1q_s16(input2_ptr + x + 8),
+ }};
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint32_t mask = (1u << n) - 1;
+ tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ }
+ }
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+ }
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
+ }
+ },
+ input1, input2, dst);
+}
+
+template <bool is_sat>
+inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
+{
+ const int32x2_t input1_1 = vget_low_s32(src1);
+ const int32x2_t input2_1 = vget_low_s32(src2);
+ const int32x2_t input1_2 = vget_high_s32(src1);
+ const int32x2_t input2_2 = vget_high_s32(src2);
+
+ int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
+ int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
+
+ // Apply scaling, conversion and rounding (round to zero)
+ // Right shift amount
+ const int64x2_t vn = vdupq_n_s64(-n);
+ // Left shift amount
+ const int64x2_t vnl = vdupq_n_s64(n);
+ // Calculate conversion bit
+ const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
+ const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
+ const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
+ const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
+
+ const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
+ const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
+ const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
+ const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
+ if (is_sat)
+ {
+ tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
+ tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
+ return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
+ }
+ else
+ {
+ tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
+ tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
+ return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
+ }
+}
+
+template <bool is_sat>
+inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
+{
+ const int32x4x2_t result = {{// First 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
+ // Second 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}};
+
+ return result;
+}
+
+template <bool is_sat>
+void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator dst(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x2_t broadcast_v = {{
+ broadcast_value_vec,
+ broadcast_value_vec,
+ }};
+ const int32x4x2_t non_broadcast_v = {{
+ vld1q_s32(non_broadcast_input_ptr + x),
+ vld1q_s32(non_broadcast_input_ptr + x + 4),
+ }};
+ const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
+
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int64_t tmp =
+ static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
+
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint64_t mask = ((uint64_t)1u << n) - 1;
+ tmp = (tmp + static_cast<int64_t>(mask)) >> n;
+ }
+ if (is_sat)
+ {
+ tmp = utility::clamp<int64_t, int32_t>(tmp);
+ }
+ *(output_ptr + x) = static_cast<int32_t>(tmp);
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x2_t ta1 = {{
+ vld1q_s32(input1_ptr + x),
+ vld1q_s32(input1_ptr + x + 4),
+ }};
+ const int32x4x2_t ta2 = {{
+ vld1q_s32(input2_ptr + x),
+ vld1q_s32(input2_ptr + x + 4),
+ }};
+ const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
+
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
+
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint64_t mask = ((uint64_t)1u << n) - 1;
+ tmp = (tmp + static_cast<int64_t>(mask)) >> n;
+ }
+ if (is_sat)
+ {
+ tmp = utility::clamp<int64_t, int32_t>(tmp);
+ }
+ *(output_ptr + x) = static_cast<int32_t>(tmp);
+ }
+ },
+ input1, input2, dst);
+ }
+}
+
+void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 8 / sizeof(float);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
+
+ using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator dst(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
+ float32x4_t b = vdupq_n_f32(broadcast_value);
+
+ const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+ const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+ const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+ const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+
+ const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+ const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+
+ float32x4_t res = wrapper::vmul(tmp0, b);
+ b = wrapper::vmul(b, mask);
+
+ res = wrapper::vmla(res, tmp1, b);
+ wrapper::vstore(output_ptr + 2 * x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
+ const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
+ auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
+ auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
+ *(output_ptr + 2 * x) = res1;
+ *(output_ptr + 2 * x + 1) = res2;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
+ float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
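+ // Complex values are stored interleaved as (real, imag) pairs. The product
+ // re = a0*b0 - a1*b1, im = a0*b1 + a1*b0 is computed by duplicating the lanes of 'a',
+ // swapping the real/imaginary lanes within each pair of 'b' and applying the {-1, 1}
+ // sign mask before the fused multiply-add.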
+
+ const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+ const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+ const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+ const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+
+ const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+ const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+
+ float32x4_t res = wrapper::vmul(tmp0, b);
+
+ b = wrapper::vrev64(b);
+ b = wrapper::vmul(b, mask);
+
+ res = wrapper::vmla(res, tmp1, b);
+ wrapper::vstore(output_ptr + 2 * x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto a0 = *(input1_ptr + 2 * x);
+ const auto a1 = *(input1_ptr + 2 * x + 1);
+ const auto b0 = *(input2_ptr + 2 * x);
+ const auto b1 = *(input2_ptr + 2 * x + 1);
+ auto res1 = a0 * b0 - a1 * b1;
+ auto res2 = a0 * b1 + a1 * b0;
+ *(output_ptr + 2 * x) = res1;
+ *(output_ptr + 2 * x + 1) = res2;
+ }
+ },
+ input1, input2, dst);
+ }
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ const int window_step_x = 16 / sizeof(uint8_t);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
+ const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
+
+ uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
+ uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
+ tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
+ tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
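+ // The widened 16-bit products are exact: the maximum value 255 * 255 = 65025 fits in uint16_t.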
+
+ if (is_scale255)
+ {
+ tmp_low = scale255_U16_U16(tmp_low);
+ tmp_high = scale255_U16_U16(tmp_high);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if (is_sat)
+ {
+ tmp_low = vqshlq_u16(tmp_low, vn);
+ tmp_high = vqshlq_u16(tmp_high, vn);
+ }
+ else
+ {
+ tmp_low = vshlq_u16(tmp_low, vn);
+ tmp_high = vshlq_u16(tmp_high, vn);
+ }
+ }
+
+ if (is_sat)
+ {
+ static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+
+ tmp_low = vminq_u16(tmp_low, max);
+ tmp_high = vminq_u16(tmp_high, max);
+ }
+
+ vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
+ vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ tmp >>= n;
+ }
+
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
+ }
+
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
+ }
+ },
+ input1, input2, dst);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t ta1 = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const uint8x8x2_t ta2u = {{
+ vld1_u8(input2_ptr + x),
+ vld1_u8(input2_ptr + x + 8),
+ }};
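+ // Widen the U8 operand to S16 so the existing S16 x S16 multiplication path can be reused below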
+ const int16x8x2_t ta2 = {
+ {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}};
+
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint32_t mask = (1u << n) - 1;
+ tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ }
+ }
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+ }
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
+ }
+ },
+ input1, input2, dst);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
+{
+ // Simply swap the two input buffers
+ mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
+}
+} // namespace
+
+void CpuMulKernel::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_UNUSED(rounding_policy);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+ // Auto initialize dst if not initialized
+ set_shape_if_empty(*dst, out_shape);
+
+ _scale = scale;
+ _scale_exponent = 0;
+ _func_quantized = nullptr;
+ _func_int = nullptr;
+ _func_float = nullptr;
+
+ bool is_scale_255 = false;
+ // Check and validate scaling factor
+ if (std::abs(scale - scale255_constant) < 0.00001f)
+ {
+ is_scale_255 = true;
+ }
+ else
+ {
+ int exponent = 0;
+
+ std::frexp(scale, &exponent);
+
+ // Store the positive exponent. We know that we compute 1/2^n.
+ // Additionally, we need to subtract 1 to compensate for frexp() using a mantissa of 0.5.
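+ // e.g. scale = 1/8: frexp() returns 0.5 * 2^-2, so exponent = -2 and _scale_exponent = 3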
+ _scale_exponent = std::abs(exponent - 1);
+ }
+
+ const DataType dt_input1 = src1->data_type();
+ const DataType dt_input2 = src2->data_type();
+ const DataType dt_output = dst->data_type();
+ const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
+
+ switch (dt_input1)
+ {
+ case DataType::QASYMM8:
+ if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
+ {
+ if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+ {
+ _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
+ }
+ else
+ {
+ _func_quantized = &mul_saturate_quantized_8<uint8_t>;
+ }
+ }
+ break;
+ case DataType::QASYMM8_SIGNED:
+ if (dt_input2 == DataType::QASYMM8_SIGNED)
+ {
+ if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+ {
+ _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
+ }
+ else
+ {
+ _func_quantized = &mul_saturate_quantized_8<int8_t>;
+ }
+ }
+ break;
+ case DataType::QSYMM16:
+ if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
+ {
+ _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
+ }
+ else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
+ {
+ _func_int = &mul_QSYMM16_QSYMM16_S32;
+ }
+ break;
+ case DataType::S16:
+ if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if (is_scale_255)
+ {
+ _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
+ }
+ }
+ if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if (is_scale_255)
+ {
+ _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
+ }
+ }
+ break;
+ case DataType::S32:
+ if (DataType::S32 == dt_input2 && DataType::S32 == dt_output)
+ {
+ _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
+ }
+ break;
+ case DataType::U8:
+ if (DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+ {
+ if (is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
+ }
+ }
+ else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if (is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
+ }
+ }
+ else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if (is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
+ }
+ }
+ break;
+ case DataType::F16:
+ _func_float = REGISTER_FP16_NEON(cpu::mul_F16_F16_F16);
+ break;
+ case DataType::F32:
+ _func_float = REGISTER_FP32_NEON(cpu::mul_F32_F32_F32);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ }
+
+ // Configure kernel window
+ Window win;
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);
+
+ ICpuKernel::configure(win);
+}
+
+size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if (this->_func_float == &mul_F32_F32_F32)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if (platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_mws_N1_fp32_neon;
+ }
+ else if (platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_mws_V1_fp32_neon;
+ }
+ else
+ {
+ if (_split_dimension == Window::DimX)
+ {
+ // Don't split the workload too finely if the tensor has been reinterpreted as 1D.
+ // This number is chosen loosely, as threading overhead varies wildly between platforms.
+ return default_mws_other_platforms_1d_tensor;
+ }
+ return default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if (this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ if (_split_dimension == Window::DimX)
+ {
+ // Don't split the workload too finely if the tensor has been reinterpreted as 1D.
+ // This number is chosen loosely, as threading overhead varies wildly between platforms.
+ return default_mws_other_platforms_1d_tensor;
+ }
+ return default_mws;
+}
+
+Status CpuMulKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
+
+ return Status{};
+}
+
+void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ if (_func_quantized != nullptr)
+ {
+ (*_func_quantized)(src1, src2, dst, window, _scale);
+ }
+ else if (_func_int != nullptr)
+ {
+ (*_func_int)(src1, src2, dst, window, _scale_exponent);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
+ (*_func_float)(src1, src2, dst, window, _scale);
+ }
+}
+
+const char *CpuMulKernel::name() const
+{
+ return "CpuMulKernel";
+}
+
+namespace
+{
+Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+ // Auto initialize dst if not initialized
+ const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
+ auto_init_if_empty(*dst, out_info);
+
+ // Configure kernel window
+ Window win = calculate_max_window(out_shape);
+
+ ICpuKernel::configure(win);
+}
+
+Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
+
+ return Status{};
+}
+
+void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ c_mul_F32_F32_F32_n(src1, src2, dst, window);
+}
+
+const char *CpuComplexMulKernel::name() const
+{
+ return "CpuComplexMulKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
new file mode 100644
index 0000000000..7eaf287507
--- /dev/null
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H
+#define ARM_COMPUTE_CPU_MUL_KERNEL_H
+
+#include "arm_compute/core/Rounding.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform multiplication between two tensors */
+class CpuMulKernel : public ICpuKernel<CpuMulKernel>
+{
+public:
+ CpuMulKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel);
+ /** Initialise the kernel's input, dst and border mode.
+ *
+ * Valid configurations (Src1,Src2) -> Dst :
+ *
+ * Support: Broadcast? Scale=1/255?
+ * - (U8,U8) -> U8, S16 N Y
+ * - (U8,S16) -> S16 N Y
+ * - (S16,U8) -> S16 N Y
+ * - (S16,S16) -> S16 N Y
+ * - (S32,S32) -> S32 Y N
+ * - (F16,F16) -> F16 N Y
+ * - (F32,F32) -> F32 Y Y
+ * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y
+ * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * If @p src1, @p src2 and @p dst are all of data type S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+ * @param[in] rounding_policy Rounding policy.
+ */
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
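+ //
+ // Illustrative usage sketch (not part of the upstream docs; tensor shapes and names are hypothetical):
+ //
+ //   TensorInfo a(TensorShape(16U, 4U), 1, DataType::F32);
+ //   TensorInfo b(TensorShape(16U, 4U), 1, DataType::F32);
+ //   TensorInfo out(TensorShape(16U, 4U), 1, DataType::F32);
+ //   CpuMulKernel k;
+ //   k.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ //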
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuMulKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
+
+ // Inherited methods overridden
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+ /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+ *
+ * @return The split dimension hint.
+ */
+ size_t get_split_dimension_hint() const
+ {
+ return _split_dimension;
+ }
+
+private:
+ /** Common signature for all the specialised multiplication functions with integer scaling factor
+ *
+ * @param[in] src1 Src1 tensor object.
+ * @param[in] src2 Src2 tensor object.
+ * @param[out] dst Dst tensor object.
+ * @param[in] window Region on which to execute the kernel
+ * @param[in] scale Integer scale factor.
+ */
+ using MulFunctionInt =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
+ /** Common signature for all the specialised multiplication functions with float scaling factor
+ *
+ * @param[in] src1 Src1 tensor object.
+ * @param[in] src2 Src2 tensor object.
+ * @param[out] dst Dst tensor object.
+ * @param[in] window Region on which to execute the kernel
+ * @param[in] scale Float scale factor.
+ */
+ using MulFunctionFloat =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+ /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
+ *
+ * @param[in] src1 Src1 tensor object.
+ * @param[in] src2 Src2 tensor object.
+ * @param[out] dst Dst tensor object.
+ * @param[in] window Region on which to execute the kernel
+ * @param[in] scale Float scale factor.
+ *
+ */
+ using MulFunctionQuantized =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+
+ MulFunctionFloat *_func_float{nullptr};
+ MulFunctionInt *_func_int{nullptr};
+ MulFunctionQuantized *_func_quantized{nullptr};
+ float _scale{0};
+ int _scale_exponent{0};
+ size_t _split_dimension{Window::DimY};
+};
+
+/** Interface for the complex pixelwise multiplication kernel. */
+class CpuComplexMulKernel : public ICpuKernel<CpuComplexMulKernel>
+{
+public:
+ CpuComplexMulKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel);
+ /** Initialise the kernel's src, dst and border mode.
+ *
+ * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ */
+ void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
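+ //
+ // Illustrative usage sketch (shapes are hypothetical): both inputs are 2-channel (complex) F32 tensors.
+ //
+ //   TensorInfo a(TensorShape(8U, 4U), 2, DataType::F32);
+ //   TensorInfo b(TensorShape(8U, 4U), 2, DataType::F32);
+ //   TensorInfo out; // shape and type are auto-initialized by configure()
+ //   CpuComplexMulKernel k;
+ //   k.configure(&a, &b, &out);
+ //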
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuComplexMulKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */
diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp
new file mode 100644
index 0000000000..b444a25ff7
--- /dev/null
+++ b/src/cpu/kernels/CpuPermuteKernel.cpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuPermuteKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace
+{
+#include "src/core/NEON/kernels/convolution/common/shims.hpp"
+} // namespace
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+inline bool is_permutation_supported(const PermutationVector &v)
+{
+ static const std::array<PermutationVector, 2> permutations2 = {{
+ PermutationVector(0U, 1U),
+ PermutationVector(1U, 0U),
+ }};
+ static const std::array<PermutationVector, 6> permutations3 = {{
+ PermutationVector(2U, 0U, 1U),
+ PermutationVector(1U, 2U, 0U),
+ PermutationVector(0U, 1U, 2U),
+ PermutationVector(0U, 2U, 1U),
+ PermutationVector(1U, 0U, 2U),
+ PermutationVector(2U, 1U, 0U),
+ }};
+ static const std::array<PermutationVector, 24> permutations4 = {
+ {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U),
+ PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U),
+ PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U),
+ PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U),
+ PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U),
+ PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), PermutationVector(2U, 0U, 3U, 1U),
+ PermutationVector(1U, 0U, 3U, 2U), PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U),
+ PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}};
+
+ return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) ||
+ (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) ||
+ (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported.");
+
+ const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
+
+ // Validate configured destination
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+
+template <typename T>
+void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm)
+{
+ const DataLayout src_layout = src->info()->data_layout();
+
+ // Source window
+ Window window_src = window;
+
+ // Only these two configurations are supported by the reorder helpers in
+ // src/core/NEON/kernels/convolution/common/shims.hpp; all other permutations fall back to the
+ // generic C++ implementation below.
+ if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) ||
+ (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}))
+ {
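+ // Collapse each dimension to a single step so the reorder helper below handles the whole
+ // block in one call per window iteration.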
+ window_src.set(Window::DimX,
+ Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
+ window_src.set(Window::DimY,
+ Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
+ window_src.set(Window::DimZ,
+ Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
+ window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
+ }
+
+ // Destination window
+ Window window_dst(window);
+ const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
+ for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
+ {
+ window_dst.set(d, zero_window);
+ }
+
+ // Create iterators
+ Iterator src_it(src, window_src);
+ Iterator dst_it(dst, window_dst);
+
+ int in_row_stride = 0;
+ int in_col_stride = 0;
+ int in_channel_stride = 0;
+ int in_batch_stride = 0;
+ int n_cols = 0;
+ int n_rows = 0;
+ int n_channels = 0;
+ int n_batches = 0;
+
+ switch (src_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T);
+ in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T);
+ in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
+ n_cols = src->info()->tensor_shape().x();
+ n_rows = window_src.y().step();
+ n_channels = src->info()->tensor_shape().z();
+ n_batches = src->info()->tensor_shape()[3];
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T);
+ in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T);
+ in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
+ n_channels = src->info()->tensor_shape().x();
+ n_cols = window_src.y().step();
+ n_rows = src->info()->tensor_shape().z();
+ n_batches = src->info()->tensor_shape()[3];
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid source data layout.");
+ break;
+ }
+ }
+
+ // CHW -> HWC
+ if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U})
+ {
+ const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
+ const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
+ const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
+ const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
+ execute_window_loop(
+ window_src,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
+ reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()),
+ reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols,
+ in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride,
+ out_row_stride, out_col_stride);
+ },
+ src_it, dst_it);
+ }
+ // HWC -> CHW
+ else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})
+ {
+ const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
+ const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
+ const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
+ const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
+ execute_window_loop(
+ window_src,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
+ reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()),
+ reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_rows, n_cols, n_channels,
+ in_batch_stride, in_row_stride, in_col_stride, out_batch_stride,
+ out_channel_stride, out_row_stride);
+ },
+ src_it, dst_it);
+ }
+ else
+ {
+ // All other cases fall back to C++
+ // Permute strides
+ Strides strides = dst->info()->strides_in_bytes();
+ Strides perm_strides = strides;
+ permute_strides(perm_strides, perm);
+ const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx =
+ id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
+ *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
+ },
+ src_it, dst_it);
+ }
+}
+} // namespace
+
+void CpuPermuteKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
+ // Destination auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm));
+
+ _perm = perm;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ // This kernel doesn't need padding so update_window_and_padding() can be skipped
+
+ ICpuKernel::configure(win);
+}
+
+Status CpuPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm));
+ return Status{};
+}
+
+void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
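+ // The permutation only moves elements, so dispatch on element size rather than data type.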
+ switch (src->info()->element_size())
+ {
+ case 1:
+ run_permute<uint8_t>(window, src, dst, _perm);
+ break;
+ case 2:
+ run_permute<uint16_t>(window, src, dst, _perm);
+ break;
+ case 4:
+ run_permute<uint32_t>(window, src, dst, _perm);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+}
+
+const char *CpuPermuteKernel::name() const
+{
+ return "CpuPermuteKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h
new file mode 100644
index 0000000000..0cb2faf223
--- /dev/null
+++ b/src/cpu/kernels/CpuPermuteKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_PERMUTE_KERNEL_H
+#define ARM_COMPUTE_CPU_PERMUTE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to perform tensor permutation given a permutation vector */
+class CpuPermuteKernel : public ICpuKernel<CpuPermuteKernel>
+{
+public:
+ CpuPermuteKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPermuteKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @note Arbitrary permutation vectors are supported with rank not greater than 4
+ *
+ * @param[in] src Source tensor to permute. Data types supported: All
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
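+ //
+ // Illustrative usage sketch (shape is hypothetical): permute an NCHW tensor to NHWC.
+ //
+ //   TensorInfo src(TensorShape(32U, 24U, 3U), 1, DataType::F32);
+ //   TensorInfo dst; // auto-initialized by configure()
+ //   CpuPermuteKernel k;
+ //   k.configure(&src, &dst, PermutationVector(2U, 0U, 1U));
+ //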
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuPermuteKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ PermutationVector _perm{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_PERMUTE_KERNEL_H */
diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp
new file mode 100644
index 0000000000..2c9627bdee
--- /dev/null
+++ b/src/cpu/kernels/CpuPool2dKernel.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuPool2dKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+using namespace misc::shape_calculator;
+
+static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels = {
+ {"neon_qu8_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)},
+ {"neon_qs8_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)},
+ {"neon_f16_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)},
+ {"neon_fp32_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
+ REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)},
+#if defined(ENABLE_NCHW_KERNELS)
+ {"neon_qu8_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)},
+ {"neon_qu8_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)},
+ {"neon_qu8_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)},
+ {"neon_qs8_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)},
+ {"neon_qs8_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)},
+ {"neon_qs8_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)},
+ {"neon_fp16_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)},
+ {"neon_fp16_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)},
+ {"neon_fp16_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
+ REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)},
+ {"neon_fp32_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)},
+ {"neon_fp32_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)},
+ {"neon_fp32_nchw_pool7",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)},
+ {"neon_fp32_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
+ REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)},
+#endif /* defined(ENABLE_NCHW_KERNELS) */
+};
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices,
+ Size2D pool_size)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0);
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ int output_width = 0;
+ int output_height = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)),
+ "Pooling region that is entirely outside input tensor is unsupported for non-float types");
+
+ std::tie(output_width, output_height) =
+ scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(),
+ pool_size.y(), pool_info.pad_stride_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+ "Calculated output dimension size is invalid");
+
+ TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ if (indices)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding &&
+ (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() &&
+ (src->data_layout() == DataLayout::NHWC),
+ "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+ if (indices)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices),
+ "Pooling indices returning source tensor coordinates is only supported for pool size 2x2");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC),
+ "Pooling kernel indices only supported for NHWC");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info);
+ }
+ }
+
+ const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{
+ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src,
+ ITensorInfo *dst,
+ ITensorInfo *indices,
+ const PoolingLayerInfo &pool_info,
+ unsigned int &num_elems_processed_per_iteration,
+ int pool_size_x,
+ int pool_size_y)
+{
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
+ if (indices)
+ {
+ // Indices auto initialization if not yet initialized
+ auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)))
+ .set_data_type(DataType::U32) /* we store the offset to the element */);
+ }
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+ const bool is_square = pool_size_x == pool_size_y;
+ const unsigned int pooled_w = dst->dimension(idx_width);
+ const unsigned int pooled_h = dst->dimension(idx_height);
+
+ // If the pool size is not square (or no optimized kernel applies), the generic MxN path is used
+ num_elems_processed_per_iteration = 1;
+
+ if (is_square)
+ {
+ switch (src->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ switch (pool_size_x)
+ {
+ case 2:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+ break;
+ case 3:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+ break;
+ default:
+ break;
+ }
+ break;
+ case DataType::F16:
+ num_elems_processed_per_iteration = 1;
+ break;
+ case DataType::F32:
+ num_elems_processed_per_iteration = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+ }
+
+ bool window_changed = false;
+ Window win{};
+ // Build the destination shape used to calculate the maximum execution window
+ TensorShape dst_shape{src->tensor_shape()};
+ dst_shape.set(0, pooled_w);
+ dst_shape.set(1, pooled_h);
+ TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
+ win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+void CpuPool2dKernel::configure(ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ ITensorInfo *indices)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ const bool is_global_pooling = pool_info.is_global_pooling;
+
+ // Get data layout
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Update pool size in case of global pooling
+ const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
+
+ const auto *uk = CpuPool2dKernel::get_implementation(
+ PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first,
+ pool_size, CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+ // Set instance variables
+ _pool_info = pool_info;
+ _data_layout = src->data_layout();
+ _pool_size = pool_size;
+ _pool_stride_x = pad_stride_info.stride().first;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuPool2dKernel").append("/").append(uk->name);
+
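+ // NHWC kernels iterate directly over the destination, while NCHW kernels use the legacy
+ // window computation which also determines the number of elements processed per iteration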
+ if (_data_layout == DataLayout::NHWC)
+ {
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+ }
+ else
+ {
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(
+ src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICpuKernel::configure(win_config.second);
+ }
+}
+
+Status CpuPool2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ const bool is_global_pooling = pool_info.is_global_pooling;
+
+ // Get data layout
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y)));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
+ (indices) ? indices->clone().get() : nullptr, pool_info,
+ num_elems_processed_per_iteration, pool_size_x,
+ pool_size_y)
+ .first);
+
+ return Status{};
+}
+
+void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0);
+ ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1);
+
+ const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
+ const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
+ const unsigned int pool_size = _pool_info.pool_size.width;
+
+ Window window_src(window);
+ if (_data_layout == DataLayout::NCHW)
+ {
+ // Set the step for the src window in the x and y directions
+ unsigned int window_x_inc = 0;
+ switch (src->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ {
+ window_x_inc = pool_stride_x;
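+ // Optimized 2x2/3x3 quantized kernels consume several output elements per iteration,
+ // so widen the source step accordingly (doubled when the stride is 2)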
+ if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2
+ : _num_elems_processed_per_iteration;
+ }
+ break;
+ }
+
+ case DataType::F16:
+ case DataType::F32:
+ {
+ window_x_inc = pool_stride_x;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+ window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x,
+ window.x().end() * pool_stride_x, window_x_inc));
+ window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y,
+ window.y().end() * pool_stride_y, pool_stride_y));
+ }
+ else
+ {
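+ // NHWC: collapse the channel dimension and step over width/height with the pooling strides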
+ window_src.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
+ window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
+ }
+ _run_method(src, dst, indices, _pool_info, window_src, window);
+}
+
+const char *CpuPool2dKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuPool2dKernel::PoolingKernel> &CpuPool2dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h
new file mode 100644
index 0000000000..859de8cc5f
--- /dev/null
+++ b/src/cpu/kernels/CpuPool2dKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the pooling layer kernel */
+class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel>
+{
+private:
+ using PoolingKernelPtr = std::add_pointer<void(
+ const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
+
+public:
+ CpuPool2dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @note F16 is supported for pool sizes 2 and 3 only
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
+ */
+ void
+ configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPool2dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct PoolingKernel
+ {
+ const char *name;
+ const PoolDataTypeISASelectorPtr is_selected;
+ PoolingKernelPtr ukernel;
+ };
+
+ static const std::vector<PoolingKernel> &get_available_kernels();
+
+private:
+ PoolingLayerInfo _pool_info{};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ unsigned int _num_elems_processed_per_iteration{0};
+ Size2D _pool_size{};
+ int _pool_stride_x{};
+ PoolingKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOL2D_KERNEL_H */
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
new file mode 100644
index 0000000000..8b484d4e0b
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuPool3dKernel.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/pool3d/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+using namespace misc::shape_calculator;
+
+static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = {
+ {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)},
+ {"neon_qs8_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)},
+ {"neon_fp16_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)},
+ {"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}};
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+ (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
+ "Exclude padding is unsupported for non-float types for Avg op");
+
+ const auto data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ const unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ const unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ const unsigned int pool_size_z = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+
+ const unsigned int stride_x = pool_info.stride.x();
+ const unsigned int stride_y = pool_info.stride.y();
+ const unsigned int stride_z = pool_info.stride.z();
+
+ ARM_COMPUTE_RETURN_ERROR_ON((pool_size_x == 0) || (pool_size_y == 0) || (pool_size_z == 0));
+ ARM_COMPUTE_RETURN_ERROR_ON((stride_x == 0) || (stride_y == 0) || (stride_z == 0));
+
+ int output_width = 0;
+ int output_height = 0;
+ int output_depth = 0;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+ "Pooling region that is entirely outside input tensor is unsupported");
+
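+ // Compute the output spatial dimensions and reject configurations that would produce an empty output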
+ std::tie(output_width, output_height, output_depth) =
+ scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+ src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+ "Calculated output dimension size is invalid");
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ TensorInfo out_info(
+ TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+ }
+
+ const auto *uk =
+ CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ return Status{};
+}
+} // namespace
+
+void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
+
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool3d_shape(src->tensor_shape(), pool_info)));
+
+ // Get data layout
+ const auto data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+
+ // Update pool size in case of global pooling
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
+ is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
+
+ const auto *uk =
+ CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+ // Set instance variables
+ _pool_info = pool_info;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuPool3dKernel").append("/").append(uk->name);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuPool3dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info));
+
+ return Status{};
+}
+
+void CpuPool3dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0);
+
+ _run_method(src, dst, _pool_info, window);
+}
+
+const char *CpuPool3dKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
new file mode 100644
index 0000000000..bd1ff61046
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform Pooling 3D. */
+class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel>
+{
+private:
+ /* Signature of the Pooling 3D NDHWC kernel functions */
+ using Pooling3dKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
+
+public:
+ CpuPool3dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3dKernel);
+ /** Set the src and dst tensor info and the pooling info.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPool3dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct Pooling3dKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ Pooling3dKernelPtr ukernel;
+ };
+
+ static const std::vector<Pooling3dKernel> &get_available_kernels();
+
+private:
+ Pooling3dLayerInfo _pool_info{};
+ Pooling3dKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
new file mode 100644
index 0000000000..ed4675ae3d
--- /dev/null
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuQuantizeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/quantize/generic/neon/list.h"
+
+#include <arm_neon.h>
+#include <map>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+
+ return Status{};
+}
+
+} // namespace
+
+void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+ static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = {
+ {"op_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_quantize_qasymm8)},
+ {"op_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_quantize_qasymm8)},
+ {"op_QASYMM8_QASYMM16", REGISTER_INTEGER_NEON(u8_run_quantize_qasymm16)},
+
+ {"op_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_quantize_qasymm8)},
+ {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_quantize_qasymm8)},
+ {"op_QASYMM8_SIGNED_QASYMM16", REGISTER_INTEGER_NEON(i8_run_quantize_qasymm16)},
+
+ // Functions for offset only requantization
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_requantize_offset_only)},
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only)},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only)},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_requantize_offset_only)},
+
+ // Functions for uint8 to int8 (and vice versa) conversion with offset-only re-quantization (no scale change)
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8",
+ REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only_convert)},
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED",
+ REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only_convert)},
+
+ {"op_F32_QSYMM8", REGISTER_FP32_NEON(fp32_i8_run_quantize_qsymm8)},
+ {"op_F32_QASYMM8", REGISTER_FP32_NEON(fp32_u8_run_quantize_qasymm8)},
+ {"op_F32_QASYMM8_SIGNED", REGISTER_FP32_NEON(fp32_i8_run_quantize_qasymm8)},
+ {"op_F32_QASYMM16", REGISTER_FP32_NEON(fp32_run_quantize_qasymm16)},
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ {"op_F16_QASYMM8", REGISTER_FP16_NEON(fp16_u8_run_quantize_qasymm8)},
+ {"op_F16_QASYMM8_SIGNED", REGISTER_FP16_NEON(fp16_i8_run_quantize_qasymm8)},
+ {"op_F16_QASYMM16", REGISTER_FP16_NEON(fp16_run_quantize_qasymm16)},
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ };
+
+ std::string function_to_call("op_");
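+ // The lookup key has the form op_[OFFSET_ONLY_[CONVERT_]]<src data type>_<dst data type>, matching the entries of quant_map above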
+
+ // Offset-only functions require both tensors to be 8-bit and to have identical scale values.
+ if (src->quantization_info().scale() == dst->quantization_info().scale() &&
+ (is_data_type_quantized_asymmetric_char(src->data_type()) &&
+ is_data_type_quantized_asymmetric_char(dst->data_type())))
+ {
+ function_to_call += "OFFSET_ONLY_";
+ // Optimized 8-bit data type conversion (offset-only re-quantization) functions.
+ // These are selected only when the data types differ and the computed offset is exactly +/-128, i.e. a pure signedness conversion.
+ auto uqinfo =
+ compute_requantization_scale_offset(src->quantization_info().uniform(), dst->quantization_info().uniform());
+ const auto src_dt = src->data_type();
+ if (src->data_type() != dst->data_type() && ((src_dt == DataType::QASYMM8_SIGNED && uqinfo.offset == 128) ||
+ (src_dt == DataType::QASYMM8 && uqinfo.offset == -128)))
+ {
+ function_to_call += "CONVERT_";
+ }
+ }
+
+ // Specify datatype for function
+ function_to_call += string_from_data_type(src->data_type()) + "_";
+ function_to_call += string_from_data_type(dst->data_type());
+
+ auto it = quant_map.find(function_to_call);
+
+ if (it == quant_map.end())
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
+ }
+ _func = it->second;
+
+ // Calculate window. Squash if possible.
+ Window win;
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+
+ ICpuKernel::configure(win);
+}
+
+Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+ return Status{};
+}
+
+void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ (*_func)(src, dst, window);
+}
+
+const char *CpuQuantizeKernel::name() const
+{
+ return "CpuQuantizeKernel";
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
new file mode 100644
index 0000000000..750310c811
--- /dev/null
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors
+ */
+class CpuQuantizeKernel : public ICpuKernel<CpuQuantizeKernel>
+{
+public:
+ CpuQuantizeKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel);
+ /** Set the input and output tensor info.
+ *
+ * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+ * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+ *
+ * @note Output auto initialization is not supported by this kernel
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuQuantizeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+ *
+ * @return The split dimension hint.
+ */
+ size_t get_split_dimension_hint() const
+ {
+ return _split_dimension;
+ }
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Common signature for all the specialised @ref CpuQuantizeKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeFunctionExecutorPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window);
+ QuantizeFunctionExecutorPtr _func{nullptr};
+ size_t _split_dimension{Window::DimY};
+};
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
new file mode 100644
index 0000000000..241e58fbce
--- /dev/null
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuReshapeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+/** [NEReshapeLayerKernel Kernel] **/
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ if (dst->tensor_shape().total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
+ }
+
+ return Status{};
+}
+
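+/** Copy the tensor element by element, remapping every destination coordinate to its source coordinate through the flattened linear index */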
+template <typename T>
+void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
+{
+ const TensorShape &src_shape = src->info()->tensor_shape();
+ const TensorShape &dst_shape = dst->info()->tensor_shape();
+
+ Iterator dst_it(dst, window);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &dst_coord)
+ {
+ Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ const auto output_ptr = dst->ptr_to_element(dst_coord);
+ const auto input_ptr = src->ptr_to_element(src_coord);
+
+ *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+ },
+ dst_it);
+}
+
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst)
+{
+ switch (src->info()->data_type())
+ {
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QSYMM8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ reshape_tensor_per_element<uint8_t>(window, src, dst);
+ break;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::F16:
+ reshape_tensor_per_element<uint16_t>(window, src, dst);
+ break;
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ reshape_tensor_per_element<uint32_t>(window, src, dst);
+ break;
+ case DataType::U64:
+ case DataType::S64:
+ case DataType::F64:
+ reshape_tensor_per_element<uint64_t>(window, src, dst);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+}
+
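+/** Copy the tensor row by row; every source row is contiguous, so each row is copied with a single memcpy */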
+void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *dst)
+{
+ const TensorShape &src_shape = src->info()->tensor_shape();
+ const TensorShape &dst_shape = dst->info()->tensor_shape();
+ Coordinates src_coord{};
+ Coordinates dst_coord{};
+
+ const auto element_size = dst->info()->element_size();
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const auto src_row_size = static_cast<int>(src_shape[0]);
+ const auto row_size_in_bytes = src_row_size * element_size;
+
+ auto output_ptr = dst->ptr_to_element(dst_coord);
+ auto input_ptr = src->ptr_to_element(src_coord);
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator dst_it(dst, win);
+ execute_window_loop(
+ win,
+ [&](Coordinates &id)
+ {
+ dst_coord = id;
+
+ for (int x = window_start_x; x < window_end_x; x += src_row_size)
+ {
+ src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ output_ptr = dst->ptr_to_element(dst_coord);
+ input_ptr = src->ptr_to_element(src_coord);
+
+ std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+ dst_coord.increment(Window::DimX, src_row_size);
+ }
+ },
+ dst_it);
+}
+
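+/** Copy the (possibly squashed) window with a single memcpy; used when src and dst are contiguous with no holes or padding */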
+void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
+{
+ Iterator src_it(src, window);
+ Iterator dst_it(dst, window);
+
+ const size_t element_size = dst->info()->element_size();
+ const auto window_size = window.x().end() - window.x().start();
+ const auto window_size_in_bytes = window_size * element_size;
+
+ const auto input_ptr = src_it.ptr();
+ const auto output_ptr = dst_it.ptr();
+
+ std::memcpy(output_ptr, input_ptr, window_size_in_bytes);
+}
+} // namespace
+
+void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+ ARM_COMPUTE_UNUSED(src);
+
+ _reshape_tensor_fn = reshape_tensor_per_element_selector;
+ // Configure kernel window
+ Window win = calculate_max_window(*dst);
+
+ ICpuKernel::configure(win);
+}
+
+Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+ return Status{};
+}
+
+void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ _reshape_tensor_fn(window, src, dst);
+}
+
+const char *CpuReshapeKernel::name() const
+{
+ return "CpuReshapeKernel";
+}
+
+size_t CpuReshapeKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
+}
+
+void CpuReshapeKernel::prepare(ITensorPack &tensors)
+{
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const ITensorInfo *src_info = src->info();
+ const ITensorInfo *dst_info = dst->info();
+
+ // Calculate kernel window based on the padding info
+ Window win;
+
+ const bool src_has_holes = has_holes(*src_info, src_info->num_dimensions() - 1);
+ const bool dst_has_holes = has_holes(*dst_info, dst_info->num_dimensions() - 1);
+ const bool src_has_holes_in_x = has_holes(*src_info, Window::DimX);
+ const bool dst_has_holes_in_x = has_holes(*dst_info, Window::DimX);
+ const auto src_row_size = static_cast<int>(src_info->tensor_shape()[0]);
+ const auto dst_row_size = static_cast<int>(dst_info->tensor_shape()[0]);
+
+ if (!src_has_holes && !dst_has_holes)
+ {
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
+ /*
+ Copy the tensor per window. If the src and dst tensors
+ are contiguous memory allocations without any holes or
+ padding, the window is squashed to 1D and a single
+ memcpy call copies the whole window in the
+ reshape_tensor_per_window function.
+ */
+ _reshape_tensor_fn = reshape_tensor_per_window;
+ }
+ else
+ {
+ win = calculate_max_window(*dst_info);
+ /*
+ Copy the tensor row by row if src and dst have no holes in the
+ X dimension and their rows hold the same number of elements
+ */
+ if (!src_has_holes_in_x && !dst_has_holes_in_x && (src_row_size == dst_row_size))
+ {
+ _reshape_tensor_fn = reshape_tensor_per_row;
+ }
+ else
+ {
+ /*
+ Fall back to the element wise copy
+ */
+ _reshape_tensor_fn = reshape_tensor_per_element_selector;
+ }
+ }
+
+ ICPPKernel::configure(win);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+/** [NEReshapeLayerKernel Kernel] **/
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
new file mode 100644
index 0000000000..ce566fd9e2
--- /dev/null
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
+#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform tensor reshaping */
+class CpuReshapeKernel : public ICpuKernel<CpuReshapeKernel>
+{
+public:
+ CpuReshapeKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuReshapeKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data type supported: All
+ * @param[out] dst Destination tensor info. Data type supported: Same as @p input
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuReshapeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ /** Prepare the reshape kernel for execution (executed only once): calculate the max or squashed window and select _reshape_tensor_fn based on the presence of holes
+ *
+ * @param[in] tensors Pack of input and output tensors
+ *
+ */
+ void prepare(ITensorPack &tensors);
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] small_network_mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+ /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+ *
+ * @return The split dimension.
+ */
+ size_t get_split_dimension() const
+ {
+ return _split_dimension;
+ }
+
+private:
+ size_t _split_dimension{Window::DimY};
+
+ std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */
diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp
new file mode 100644
index 0000000000..7cf8916e9b
--- /dev/null
+++ b/src/cpu/kernels/CpuScaleKernel.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuScaleKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/scale/neon/list.h"
+#include "src/cpu/kernels/scale/sve/list.h"
+#include "support/Rounding.h"
+
+#include <arm_neon.h>
+#include <map>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#ifdef ENABLE_NCHW_KERNELS
+void scale_area_nchw_u8(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, offsets, policy, border_mode, constant_border_value, sampling_offset, align_corners);
+ using namespace scale_helpers;
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
+
+ // Don't increment in width/height/channels for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator src_i(src, win_in);
+ Iterator dst_i(dst, window);
+
+ const auto wr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const auto w = src->info()->dimension(0);
+ const auto h = src->info()->dimension(1);
+ const size_t in_stride = src->info()->strides_in_bytes()[1];
+
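+ // Each iteration produces 16 adjacent output pixels: every lane holds the area-averaged
+ // value of its source region and the two halves are stored as a single 16-byte vector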
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
+
+ vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ src_i, dst_i);
+}
+
+template <typename T>
+void scale_bilinear_qasymm_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ // Get data layout and width/height indices
+ const int idx_width = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height),
+ dst->info()->dimension(idx_height), align_corners);
+ Window win_off;
+ win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(idx_width, Window::Dimension(0, 0, 0));
+ win_in.set(idx_height, Window::Dimension(0, 0, 0));
+
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator src_i(src, win_in);
+ Iterator dst_i(dst, window);
+
+ const int32_t in_dim_w = src->info()->dimension(idx_width);
+ const int32_t in_dim_h = src->info()->dimension(idx_height);
+ const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
+ const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
+
+ const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ using ConstType = T;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
+ const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+ offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dx_val =
+ *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dy_val =
+ *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
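+ // Gather the 2x2 neighbourhood, substituting the constant border value for any tap that falls outside the input plane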
+ const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h))
+ : const_border_value;
+ const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+ *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ src_i, dst_i);
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
+ const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+ offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dx_val =
+ *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dy_val =
+ *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+ auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+ auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+ auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
+ const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
+ const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
+ const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
+
+ const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+ *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ src_i, dst_i);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+/** Function to perform scale using bilinear interpolation on the given window */
+template <typename T>
+void scale_bilinear_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ arm_compute::cpu::scale_bilinear_nchw<T>(src, dst, dx, dy, offsets, border_mode, constant_border_value,
+ sampling_offset, align_corners, window);
+}
+
+template <typename T>
+void scale_nearest_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy, border_mode);
+ arm_compute::cpu::scale_nearest_nchw<T>(src, dst, dx, dy, offsets, constant_border_value, sampling_offset,
+ align_corners, window);
+}
+
+#endif // ENABLE_NCHW_KERNELS
+
+namespace kernels
+{
+namespace
+{
+static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels = {
+ {"sve_fp16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)},
+ {"sve_fp32_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)},
+ {"sve_qu8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8 && data.isa.sve &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)},
+ {"sve_qs8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)},
+ {"sve_u8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)},
+ {"sve_s16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)},
+ {"neon_fp16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_common_neon_scale)},
+ {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)},
+ {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)},
+ {"neon_qs8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)},
+ {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)},
+ {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)},
+ {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)},
+};
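+
+// A hedged illustration of how the table above is consulted: get_implementation() walks the
+// entries in order and returns the first one whose selector lambda accepts the
+// (data type, ISA, interpolation policy) triple. For example, an F32 nearest-neighbour request
+// on a CPU without SVE falls through the SVE entries and lands on "neon_fp32_scale". The
+// commented snippet below mirrors the lookup performed in validate_arguments() and configure();
+// it is a sketch, not additional dispatch logic.
+//
+//   const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{
+//       DataType::F32, CPUInfo::get().get_isa(), InterpolationPolicy::NEAREST_NEIGHBOR});
+//   // uk->name would be "neon_fp32_scale" here; uk->ukernel is the function to call.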
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
+{
+ const auto *uk = CpuScaleKernel::get_implementation(
+ ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
+
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+ info.sampling_policy != SamplingPolicy::TOP_LEFT);
+ ARM_COMPUTE_UNUSED(info.constant_border_value);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
+
+ const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+ const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const auto output_width = dst->dimension(width_index);
+ const auto output_height = dst->dimension(height_index);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) &&
+ (data_layout != DataLayout::NHWC ||
+ info.interpolation_policy != InterpolationPolicy::BILINEAR ||
+ info.border_mode != BorderMode::REPLICATE));
+
+ if (info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ }
+
+ if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ if (dx != nullptr && dy != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
+ }
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners &&
+ !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+
+ if (info.interpolation_policy == InterpolationPolicy::AREA)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuScaleKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, offsets);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info));
+
+ const auto *uk = CpuScaleKernel::get_implementation(
+ ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuScaleKernel")
+ .append("/")
+ .append(uk->name)
+ .append("_")
+ .append(string_from_interpolation_policy(info.interpolation_policy));
+
+ // Get data layout and width/height indices
+ _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+ _policy = info.interpolation_policy;
+ _border_mode = info.border_mode;
+ _constant_border_value = info.constant_border_value;
+ _align_corners = info.align_corners;
+
+ if (info.sampling_policy == SamplingPolicy::CENTER)
+ {
+ _sampling_offset = 0.5f;
+ }
+
+ // Compute the ratio between source width/height and destination width/height
+ const auto wr =
+ scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
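+    // Worked example (illustrative numbers, align_corners off): resizing 10x10 to 20x20 gives
+    // wr = hr = 10 / 20 = 0.5, both <= 1, so an AREA request is rewritten to NEAREST_NEIGHBOR
+    // below, while down-scaling (ratios > 1) keeps AREA.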
+ _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _policy;
+
+ if (_border_mode == BorderMode::UNDEFINED)
+ {
+ _border_mode = BorderMode::CONSTANT;
+ _constant_border_value = PixelValue();
+ }
+
+#ifdef ENABLE_NCHW_KERNELS
+ // Configure scale function to run
+ if (_data_layout == DataLayout::NCHW)
+ {
+ std::string function_to_call("scale_");
+ function_to_call += string_from_data_type(src->data_type()) + "_";
+ function_to_call += string_from_data_layout(_data_layout) + "_";
+ function_to_call += string_from_interpolation_policy(_policy);
+
+ const static std::map<std::string, ScaleKernelPtr> map_nchw_function = {
+ {"scale_U8_NCHW_AREA_CONSTANT", &arm_compute::cpu::scale_area_nchw_u8},
+ {"scale_U8_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<uint8_t>},
+ {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<uint8_t>},
+ {"scale_QASYMM8_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_qasymm_nchw<uint8_t>},
+ {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<uint8_t>},
+ {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_qasymm_nchw<int8_t>},
+ {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<int8_t>},
+ {"scale_S16_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<int16_t>},
+ {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<int16_t>},
+ {"scale_F16_NCHW_BILINEAR", REGISTER_FP16_NEON(arm_compute::cpu::fp16_bilinear_neon_scale_nchw)},
+ {"scale_F16_NCHW_NEAREST_NEIGHBOUR", REGISTER_FP16_NEON(arm_compute::cpu::fp16_nearest_neon_scale_nchw)},
+ {"scale_F32_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<float>},
+ {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<float>},
+ };
+ auto it = map_nchw_function.find(function_to_call);
+ if (it != map_nchw_function.end())
+ {
+ _nchw_func = it->second;
+ }
+ }
+#endif // ENABLE_NCHW_KERNELS
+
+ // Configure window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuScaleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *output,
+ const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
+ return Status{};
+}
+
+void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_nchw_func == nullptr && _data_layout == DataLayout::NCHW);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ const auto dx = tensors.get_const_tensor(TensorType::ACL_INT_0);
+ const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1);
+ const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2);
+
+ if (_data_layout == DataLayout::NCHW)
+ {
+ _nchw_func(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset,
+ _align_corners, window);
+ }
+ else
+ {
+ _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset,
+ _align_corners, window);
+ }
+}
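+
+// A hedged run sketch (src_tensor/dst_tensor/kernel are placeholder names; allocation and
+// scheduling are omitted). The pack slots mirror the lookups in run_op() above; the
+// dx/dy/offsets slots can be left out for kernels that do not consume them, since
+// get_const_tensor() simply returns nullptr for missing ids.
+//
+//   ITensorPack pack;
+//   pack.add_const_tensor(TensorType::ACL_SRC, &src_tensor);
+//   pack.add_tensor(TensorType::ACL_DST, &dst_tensor);
+//   kernel.run_op(pack, kernel.window(), ThreadInfo{});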
+
+const char *CpuScaleKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuScaleKernel::ScaleKernel> &CpuScaleKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h
new file mode 100644
index 0000000000..f2cad5e899
--- /dev/null
+++ b/src/cpu/kernels/CpuScaleKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */
+class CpuScaleKernel : public ICpuKernel<CpuScaleKernel>
+{
+private:
+    /** Scale function to use for a particular configuration */
+ using ScaleKernelPtr = std::add_pointer<void(const ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ InterpolationPolicy,
+ BorderMode,
+ PixelValue,
+ float,
+ bool,
+ const Window &)>::type;
+
+public:
+ CpuScaleKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel);
+ /** Initialise the kernel's inputs, output and interpolation policy
+ *
+     * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
+ * @note Using @p policy Area only supports data layout NCHW and input data type U8.
+ * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32.
+ * @param[in] dx Distance x tensor info. Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
+ * @param[in] dy Distance y tensor info. Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
+ * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+     * @param[out] dst Destination tensor info. Data types supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo to use for configuration
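+     *
+     * A minimal configuration sketch (the shapes and the ScaleKernelInfo arguments below are
+     * illustrative assumptions; the trailing false disables use_padding, which this kernel requires):
+     * @code
+     * TensorInfo src(TensorShape(3U, 32U, 32U), 1, DataType::F32);
+     * TensorInfo dst(TensorShape(3U, 64U, 64U), 1, DataType::F32);
+     * src.set_data_layout(DataLayout::NHWC);
+     * dst.set_data_layout(DataLayout::NHWC);
+     *
+     * CpuScaleKernel scale;
+     * scale.configure(&src, nullptr, nullptr, nullptr, &dst,
+     *                 ScaleKernelInfo(InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE,
+     *                                 PixelValue(), SamplingPolicy::TOP_LEFT, false));
+     * @endcode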
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuScaleKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct ScaleKernel
+ {
+ const char *name;
+ const ScaleKernelDataTypeISASelectorDataPtr is_selected;
+ ScaleKernelPtr ukernel;
+ };
+
+ static const std::vector<ScaleKernel> &get_available_kernels();
+
+private:
+ ScaleKernelPtr _nchw_func{nullptr};
+ InterpolationPolicy _policy{};
+ BorderMode _border_mode{};
+ PixelValue _constant_border_value{};
+ float _sampling_offset{0};
+ bool _align_corners{false};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ ScaleKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
new file mode 100644
index 0000000000..b7e395fb79
--- /dev/null
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2017-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuSoftmaxKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/softmax/list.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+
+/* Softmax */
+static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = {
+ {"sme2_fp32_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::F32 && data.isa.sme2 && data.axis == 0); },
+ REGISTER_FP32_SME2(sme2_fp32_softmax)},
+ {"neon_fp32_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<false>)},
+ {"sme2_fp16_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::F16 && data.isa.sme2 && data.axis == 0); },
+ REGISTER_FP16_SME2(sme2_fp16_softmax)},
+ {"neon_fp16_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<false>)},
+ {"sme2_qu8_softmax_lut_512VL",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ {
+ return (!data.is_log && data.dt == DataType::QASYMM8 && data.isa.sme2 && data.axis == 0 &&
+ data.sme2_vector_length == 512);
+ },
+ REGISTER_QASYMM8_SME2(sme2_qasymm8_softmax_lut_512VL)},
+ {"neon_qu8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)},
+ {"sme2_qs8_softmax_lut_512VL",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ {
+ return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED && data.isa.sme2 && data.axis == 0 &&
+ data.sme2_vector_length == 512);
+ },
+ REGISTER_QASYMM8_SIGNED_SME2(sme2_qasymm8_signed_softmax_lut_512VL)},
+ {"neon_qs8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)},
+ {"neon_fp32_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<true>)},
+ {"neon_fp16_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<true>)},
+ {"neon_qu8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)},
+ {"neon_qs8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)},
+};
+
+void init_lut(std::vector<float> &lut, DataType type, float scale, float beta)
+{
+ if (type == DataType::QASYMM8)
+ {
+ for (int i = 0; i < 256; ++i)
+ {
+ lut.push_back(std::exp(-scale * beta * i));
+ }
+ }
+ else if (type == DataType::QASYMM8_SIGNED)
+ {
+ for (int i = -128; i < 128; ++i)
+ {
+ lut.push_back(std::exp(-scale * beta * i));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Invalid datatype for QASYMM8/QASYMM8_SIGNED softmax");
+ }
+}
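+
+// Worked example for the table built above (illustrative numbers): for QASYMM8 input with
+// scale = 0.1f and beta = 1.0f, lut[i] = exp(-0.1 * i) for i in [0, 255]. The sme2 *_lut_512VL
+// kernels registered above can then resolve exp(x - max) with a single table lookup, since the
+// quantized difference (max - x) is a non-negative integer index into the table.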
+
+Status validate_arguments_softmax(
+ const ITensorInfo &src, const ITensorInfo &dst, float beta, int axis, const ITensorInfo &tmp, bool is_log)
+{
+ ARM_COMPUTE_UNUSED(beta);
+ // Check input
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < 0 || axis > 3);
+
+ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
+
+ // Check output if configured
+ if (dst.total_size() != 0)
+ {
+ const QuantizationInfo output_quantization =
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log)
+ : dst.quantization_info();
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
+ }
+
+ // Check tmp if configured
+ if (tmp.total_size() != 0)
+ {
+ // We have temporary storage only if src data type is quantized.
+ // Therefore, tmp data type must be F32
+ ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric);
+
+ // We could potentially reduce tmp memory if we could predict or make an assumption
+ // on the maximum number of threads that will run in parallel.
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
+ }
+
+ return Status{};
+}
+} // namespace
+
+const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+void CpuSoftmaxKernel::configure(
+ const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, int axis, ITensorInfo *tmp)
+{
+ _axis = axis;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, axis, *tmp, is_log));
+
+ // Configure kernel window
+ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
+
+ // Output auto initialization if not yet initialized
+ const QuantizationInfo output_quantization =
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log)
+ : dst->quantization_info();
+ auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
+
+ // Tmp auto initialization if not yet initialized and src is quantized
+ if (is_quantized_asymmetric)
+ {
+ auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(DataType::F32).reset_padding());
+ }
+
+ const auto *uk = CpuSoftmaxKernel::get_implementation(SoftmaxKernelDataTypeISASelectorData{
+ src->data_type(), CPUInfo::get().get_isa(), is_log, axis, CPUInfo::get().get_sme2_vector_length()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel");
+
+ _beta = beta;
+ _run_method = uk->ukernel;
+ _name = kernel_name.append("/").append(uk->name);
+
+ Window win;
+
+ int vec_size = 16 / dst->element_size();
+
+ if (_axis == 0)
+ {
+ win = calculate_max_window(*dst, Steps());
+
+        /// TODO: Check dimensions > 0 for holes only. For this, we need
+ /// a utility function checking if there are holes after some dimension.
+ if (!has_holes(*dst, dst->num_dimensions() - 1))
+ {
+ win = win.collapse(win, Window::DimY);
+ }
+ }
+ else if (_axis > 0 && _axis <= 3)
+ {
+ win = calculate_max_window(*dst, Steps(vec_size));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Invalid axis");
+ }
+
+ win.set(_axis, Window::Dimension(0, 1, 1));
+
+ ICpuKernel<CpuSoftmaxKernel>::configure(win);
+
+ const std::string uk_name = uk->name;
+ if (uk_name == "sme2_qu8_softmax_lut_512VL" || uk_name == "sme2_qs8_softmax_lut_512VL")
+ {
+ const float scale = src->quantization_info().uniform().scale;
+ init_lut(_lut, src->data_type(), scale, beta);
+ }
+}
+
+Status CpuSoftmaxKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *dst, float beta, int axis, bool is_log, const ITensorInfo *tmp)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, axis, *tmp, is_log));
+
+ return Status{};
+}
+
+void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+ {
+ auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
+ unsigned int num_elems_processed_per_iteration;
+ if (_axis == 0)
+ {
+ num_elems_processed_per_iteration = src->info()->valid_region().shape[_axis];
+ }
+ else
+ {
+            // 16 QASYMM8/QASYMM8_SIGNED elements fit into the 16-byte vectors.
+ num_elems_processed_per_iteration = 16;
+ }
+ const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
+
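+        // Illustrative arithmetic: with an F32 tmp (4-byte elements) and 16 elements per
+        // iteration, each thread owns a 64-byte slice of tmp, so thread_id 3 starts at byte 192.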
+ void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+ _run_method(src, tmp_for_thread, dst, _beta, _axis, window, _lut.data());
+ }
+ else
+ {
+ _run_method(src, nullptr, dst, _beta, _axis, window, nullptr);
+ }
+}
+
+const char *CpuSoftmaxKernel::name() const
+{
+ return _name.c_str();
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
new file mode 100644
index 0000000000..676e79782b
--- /dev/null
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for softmax computation */
+class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel>
+{
+private:
+ using SoftmaxKernelPtr = std::add_pointer<void(
+ const ITensor *, void *const, ITensor *, float, int, const Window &, const float *)>::type;
+
+public:
+ CpuSoftmaxKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel);
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] beta A scaling factor for the exponent.
+ * @param[in] is_log True if the operation is log-softmax.
+ * @param[in] axis The axis along which to perform the softmax operation.
+ *
+ * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
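+     *
+     * A minimal F32 configuration sketch (the shape is an illustrative assumption; dst is
+     * auto-initialised from src, and tmp is only initialised for quantized inputs but must
+     * still be non-null):
+     * @code
+     * TensorInfo src(TensorShape(128U, 16U), 1, DataType::F32);
+     * TensorInfo dst;
+     * TensorInfo tmp;
+     *
+     * CpuSoftmaxKernel softmax;
+     * softmax.configure(&src, &dst, 1.f, false, 0, &tmp);
+     * @endcode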
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, int axis, ITensorInfo *tmp);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuSoftmaxKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int axis, bool is_log, const ITensorInfo *tmp);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct SoftmaxKernel
+ {
+ const char *name;
+ const SoftmaxKernelDataTypeISASelectorDataPtr is_selected;
+ SoftmaxKernelPtr ukernel;
+ };
+
+ static const std::vector<SoftmaxKernel> &get_available_kernels();
+
+private:
+ float _beta{1.0f};
+ SoftmaxKernelPtr _run_method{nullptr};
+ std::string _name{};
+ int _axis{};
+ std::vector<float> _lut = {};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
new file mode 100644
index 0000000000..c8706ff651
--- /dev/null
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuSubKernel.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+#include "src/cpu/kernels/sub/neon/impl.h"
+#include "src/cpu/kernels/sub/neon/list.h"
+
+#if defined(ENABLE_FP32_KERNELS)
+namespace
+{
+static constexpr size_t default_mws_N1_fp32_neon = 24385;
+static constexpr size_t default_mws_V1_fp32_neon = 40520;
+} // namespace
+#endif /* ENABLE_FP32_KERNELS */
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+using CpuSubKernelDataTypeISASelectorData = CpuAddKernelDataTypeISASelectorData;
+using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
+
+static const std::vector<CpuSubKernel::SubKernel> available_kernels = {
+ {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)},
+ {"neon_fp16_sub",
+ [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon_fp16)},
+ {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)},
+ {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)},
+ {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)},
+ {"neon_qu8_sub_fixedpoint",
+ [](const CpuSubKernelDataTypeISASelectorData &data)
+ { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)},
+ {"neon_qs8_sub_fixedpoint",
+ [](const CpuSubKernelDataTypeISASelectorData &data)
+ { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)},
+ {"neon_qu8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)},
+ {"neon_qs8_sub",
+ [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)},
+ {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)},
+};
+
+inline Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
+
+ const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
+ const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+ CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
+
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src0.data_type()) && (policy == ConvertPolicy::WRAP),
+ "Convert policy cannot be WRAP if datatype is quantized");
+
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+ return Status{};
+}
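+
+// A hedged note on the broadcast check above: TensorShape::broadcast_shape() follows the usual
+// rule that each dimension pair must either match or contain a 1. For example, shapes
+// (16, 1, 3) and (16, 5, 1) broadcast to (16, 5, 3), whereas (16, 2, 3) and (16, 5, 1) are
+// incompatible and yield a shape with total_size() == 0, which trips the error message above.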
+} // namespace
+
+void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape());
+
+ // Auto initialize dst if not initialized
+ set_shape_if_empty(*dst, out_shape);
+ set_data_type_if_unknown(*dst, src0->data_type());
+
+ const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst);
+ const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+ CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _policy = policy;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuSubKernel").append("/").append(uk->name);
+
+ // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped
+ Window win;
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src0, *src1);
+
+ ICpuKernel::configure(win);
+}
+
+size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if (this->_run_method == &sub_same_neon<float>)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if (platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_mws_N1_fp32_neon;
+ }
+ else if (platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if (this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
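+            // Illustrative arithmetic: on an N1 with a window of 1024 x-iterations and 8
+            // y-iterations, mws = 24385 / (8192 / 8) = 23, which the max() below keeps at >= 1.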
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
+Status
+CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
+
+ return Status{};
+}
+
+void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ _run_method(src0, src1, dst, _policy, window);
+}
+
+const char *CpuSubKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuSubKernel::SubKernel> &CpuSubKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
new file mode 100644
index 0000000000..5fa0dc411a
--- /dev/null
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H
+#define ARM_COMPUTE_CPU_SUB_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform subtraction between two tensors */
+class CpuSubKernel : public ICpuKernel<CpuSubKernel>
+{
+private:
+ using SubKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+ using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
+
+public:
+ CpuSubKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel);
+
+ /** Initialise the kernel's src and dst.
+ *
+ * Valid configurations (src0,src1) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (QASYMM8, QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ *
+ * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
+ * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
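+     *
+     * A minimal F32 configuration sketch (shapes are illustrative assumptions; dst is left
+     * empty so its shape and data type are auto-initialised from the inputs):
+     * @code
+     * TensorInfo a(TensorShape(16U, 4U), 1, DataType::F32);
+     * TensorInfo b(TensorShape(16U, 4U), 1, DataType::F32);
+     * TensorInfo out;
+     *
+     * CpuSubKernel sub;
+     * sub.configure(&a, &b, &out, ConvertPolicy::SATURATE);
+     * @endcode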
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuSubKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+     * @return Minimum workload size for the requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+ struct SubKernel
+ {
+ const char *name;
+ const CpuSubKernelDataTypeISASelectorDataPtr is_selected;
+ SubKernelPtr ukernel;
+ };
+
+ static const std::vector<SubKernel> &get_available_kernels();
+
+ size_t get_split_dimension() const
+ {
+ return _split_dimension;
+ }
+
+private:
+ ConvertPolicy _policy{};
+ SubKernelPtr _run_method{nullptr};
+ std::string _name{};
+ size_t _split_dimension{Window::DimY};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */
diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp
new file mode 100644
index 0000000000..0f762ba041
--- /dev/null
+++ b/src/cpu/kernels/CpuTransposeKernel.cpp
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuTransposeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+unsigned int num_elems_processed(size_t element_size)
+{
+ switch (element_size)
+ {
+ case 1:
+ return 8;
+ case 2:
+ return 4;
+ case 4:
+#ifdef __aarch64__
+ return 8;
+#else // __aarch64__
+ return 4;
+#endif // __aarch64__
+ default:
+ break;
+ }
+
+ ARM_COMPUTE_ERROR("Element size not supported");
+}
+
+void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ const int window_step_x = 8;
+ const int window_step_y = 8;
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_start_y = window.y().start();
+ const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+ const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ // Check if we need a left-over loop for the y dimension
+ bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
+
+ Window window_in(window);
+ window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+ if (left_over_loop_y)
+ {
+ // Check if window_end_y_multiple_of is greater than window_start_y
+ if (window_end_y_multiple_of > window_start_y)
+ {
+ window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
+ }
+ else
+ {
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
+ }
+ }
+
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator output(out, window_out);
+
+ // Run the SIMD path if and only if the input is not a row-vector
+ if (in->info()->dimension(1) != 1)
+ {
+ Iterator input(in, window_in);
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ // Compute 8x8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x8_t row0 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
+ const uint8x8_t row1 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
+ const uint8x8_t row2 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
+ const uint8x8_t row3 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
+ const uint8x8_t row4 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
+ const uint8x8_t row5 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
+ const uint8x8_t row6 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
+ const uint8x8_t row7 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
+
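+                    // The 8x8 block is transposed in three interleave stages: vtrn_u8 swaps bytes
+                    // between adjacent row pairs (2x2 blocks), vtrn_u16 then swaps 16-bit lanes
+                    // (4x4 blocks), and vtrn_u32 finally swaps 32-bit halves, after which each
+                    // vector holds one column of the original tile, ready to be stored as a row.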
+ // Transpose 2x2
+ const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
+ const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
+ const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
+ const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
+
+ // Transpose 4x4
+ const uint16x4x2_t k0_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
+ const uint16x4x2_t k1_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
+ const uint16x4x2_t k2_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
+ const uint16x4x2_t k3_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
+
+ // Transpose 8x8
+ const uint32x2x2_t k0_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
+ const uint32x2x2_t k1_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
+ const uint32x2x2_t k2_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
+ const uint32x2x2_t k3_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
+ }
+
+ // Compute left-over elements along the x dimension (1x8)
+ for (; x < window_end_x; ++x)
+ {
+ const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
+ const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
+ const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
+ const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
+ const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
+ const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
+ const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
+ const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
+
+ uint8x8_t result = vdup_n_u8(0);
+ result = vset_lane_u8(val0, result, 0);
+ result = vset_lane_u8(val1, result, 1);
+ result = vset_lane_u8(val2, result, 2);
+ result = vset_lane_u8(val3, result, 3);
+ result = vset_lane_u8(val4, result, 4);
+ result = vset_lane_u8(val5, result, 5);
+ result = vset_lane_u8(val6, result, 6);
+ result = vset_lane_u8(val7, result, 7);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+
+ vst1_u8(output.ptr() + dst_offset_in_bytes, result);
+ }
+ },
+ input, output);
+ }
+
+ if (left_over_loop_y)
+ {
+ window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
+ window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
+
+ Iterator input(in, window_in);
+ Iterator output(out, window_out);
+
+ // Compute left-over elements along the y dimension (1x1)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint8_t val0 = *input.ptr();
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+
+ *(output.ptr() + dst_offset_in_bytes) = val0;
+ },
+ input, output);
+ }
+}
+
+void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ const int window_step_x = 4;
+ const int window_step_y = 4;
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_start_y = window.y().start();
+ const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+ const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ // Check if we need a left-over loop for the y dimension
+ bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
+
+ Window window_in(window);
+ window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+ if (left_over_loop_y)
+ {
+ // Check if window_end_y_multiple_of is greater than window_start_y
+ if (window_end_y_multiple_of > window_start_y)
+ {
+ window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
+ }
+ else
+ {
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
+ }
+ }
+
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator output(out, window_out);
+
+ // Run the SIMD path if and only if the input is not a row-vector
+ if (in->info()->dimension(1) != 1)
+ {
+ Iterator input(in, window_in);
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ // Compute 4x4 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x4_t row0 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint16x4_t row1 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint16x4_t row2 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint16x4_t row3 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ // Transpose 2x2
+ const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
+ const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
+
+ // Transpose 4x4
+ const uint32x2x2_t k0_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
+ const uint32x2x2_t k1_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k0_u32.val[0]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k1_u32.val[0]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k0_u32.val[1]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k1_u32.val[1]));
+ }
+
+ // Compute left-over elements (1x4)
+ for (; x < window_end_x; ++x)
+ {
+ const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ uint16x4_t result = vdup_n_u16(0);
+ result = vset_lane_u16(val0, result, 0);
+ result = vset_lane_u16(val1, result, 1);
+ result = vset_lane_u16(val2, result, 2);
+ result = vset_lane_u16(val3, result, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
+ }
+ },
+ input, output);
+ }
+
+ if (left_over_loop_y)
+ {
+ window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
+ window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
+
+ Iterator input(in, window_in);
+ Iterator output(out, window_out);
+
+ // Compute left-over elements along the y dimension (1x1)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+
+ *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
+ }
+}
+
+#ifdef __aarch64__
+inline uint32x4x2_t vld1q_u32_x2_(const uint32_t *ptr)
+{
+ // gcc-7 doesn't support vld1q_u32_x2 instruction
+ return {vld1q_u32(ptr), vld1q_u32(ptr + 4)};
+}
+
+inline void vst1q_u32_x2_(const uint32_t *ptr, const uint32x4x2_t &val)
+{
+ // gcc-7 doesn't support vst1q_u32_x2 instruction
+ vst1q_u32(const_cast<uint32_t *>(ptr), val.val[0]);
+ vst1q_u32(const_cast<uint32_t *>(ptr + 4), val.val[1]);
+}
+
+void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ constexpr int window_step_x = 8;
+ constexpr int window_step_y = 8;
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_start_y = window.y().start();
+ const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+ const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ // Check if we need a left-over loop for the y dimension
+ bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
+
+ Window window_in(window);
+ window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+ if (left_over_loop_y)
+ {
+ // Check if window_end_y_multiple_of is greater than window_start_y
+ if (window_end_y_multiple_of > window_start_y)
+ {
+ window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
+ }
+ else
+ {
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
+ }
+ }
+
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator output(out, window_out);
+
+ // Run the SIMD path if and only if the input is not a row-vector
+ if (in->info()->dimension(1) != 1)
+ {
+ Iterator input(in, window_in);
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ // Compute 8x8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load
+ const uint32x4x2_t row0 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row1 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row2 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row3 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row4 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row5 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row6 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row7 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+
+ // Transpose 2x4
+ const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]),
+ vtrn2q_u32(row0.val[0], row1.val[0])};
+ const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]),
+ vtrn2q_u32(row0.val[1], row1.val[1])};
+ const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]),
+ vtrn2q_u32(row2.val[0], row3.val[0])};
+ const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]),
+ vtrn2q_u32(row2.val[1], row3.val[1])};
+ const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]),
+ vtrn2q_u32(row4.val[0], row5.val[0])};
+ const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]),
+ vtrn2q_u32(row4.val[1], row5.val[1])};
+ const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]),
+ vtrn2q_u32(row6.val[0], row7.val[0])};
+ const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]),
+ vtrn2q_u32(row6.val[1], row7.val[1])};
+
+ // Transpose 2x2
+ const uint64x2x2_t k0_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
+ const uint64x2x2_t k1_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
+ const uint64x2x2_t k2_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
+ const uint64x2x2_t k3_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
+ const uint64x2x2_t k4_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
+ const uint64x2x2_t k5_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
+ const uint64x2x2_t k6_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
+ const uint64x2x2_t k7_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};
+
+ // Swap blocks
+ const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]),
+ vreinterpretq_u32_u64(k4_u64.val[0])};
+ const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]),
+ vreinterpretq_u32_u64(k5_u64.val[0])};
+ const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]),
+ vreinterpretq_u32_u64(k4_u64.val[1])};
+ const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]),
+ vreinterpretq_u32_u64(k5_u64.val[1])};
+ const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]),
+ vreinterpretq_u32_u64(k6_u64.val[0])};
+ const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]),
+ vreinterpretq_u32_u64(k7_u64.val[0])};
+ const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]),
+ vreinterpretq_u32_u64(k6_u64.val[1])};
+ const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]),
+ vreinterpretq_u32_u64(k7_u64.val[1])};
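+
+ // col<N> now holds column N of the 8x8 input tile, i.e. { row0[N], row1[N], ..., row7[N] },
+ // so each store below writes one fully transposed output row.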
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ // Store
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ col0);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ col1);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ col2);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ col3);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+ col4);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+ col5);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+ col6);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+ col7);
+ }
+
+ // Compute left-over elements (8x1)
+ for (; x < window_end_x; ++x)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+ const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+ const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+ const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+ const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+
+ uint32x4_t result0 = vdupq_n_u32(0);
+ uint32x4_t result1 = vdupq_n_u32(0);
+ result0 = vsetq_lane_u32(val0, result0, 0);
+ result0 = vsetq_lane_u32(val1, result0, 1);
+ result0 = vsetq_lane_u32(val2, result0, 2);
+ result0 = vsetq_lane_u32(val3, result0, 3);
+ result1 = vsetq_lane_u32(val4, result1, 0);
+ result1 = vsetq_lane_u32(val5, result1, 1);
+ result1 = vsetq_lane_u32(val6, result1, 2);
+ result1 = vsetq_lane_u32(val7, result1, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
+ }
+ },
+ input, output);
+ }
+
+ if (left_over_loop_y)
+ {
+ window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
+ window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
+
+ Iterator input(in, window_in);
+ Iterator output(out, window_out);
+
+ // Compute left-over elements along the y dimension (1x1)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+
+ *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
+ }
+}
+#else // __aarch64__
+void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ const int window_step_x = 4;
+ const int window_step_y = 4;
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_start_y = window.y().start();
+ const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+ const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ // Check if we need a left-over loop for the y dimension
+ bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
+
+ Window window_in(window);
+ window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+ if (left_over_loop_y)
+ {
+ // Check if window_end_y_multiple_of is greater than window_start_y
+ if (window_end_y_multiple_of > window_start_y)
+ {
+ window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
+ }
+ else
+ {
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
+ }
+ }
+
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator output(out, window_out);
+
+ // Run the SIMD path if and only if the input is not a row-vector
+ if (in->info()->dimension(1) != 1)
+ {
+ Iterator input(in, window_in);
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ // Compute 4x4 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint32x4_t row0 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32x4_t row1 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32x4_t row2 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32x4_t row3 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ // Transpose 2x2
+ const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
+ const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
+ const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
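+
+ // Each k*_u32 is a transposed 2x2 sub-block: k0/k2 come from the low/high halves of rows 0-1 and
+ // k3/k1 from rows 2-3, so combining k0 with k3 and k2 with k1 below yields the four transposed
+ // output rows (the off-diagonal blocks swap position).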
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ // Swap block 01 with block 10 and store
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
+ }
+
+ // Compute left-over elements (1x4)
+ for (; x < window_end_x; ++x)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ uint32x4_t result = vdupq_n_u32(0);
+ result = vsetq_lane_u32(val0, result, 0);
+ result = vsetq_lane_u32(val1, result, 1);
+ result = vsetq_lane_u32(val2, result, 2);
+ result = vsetq_lane_u32(val3, result, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
+ }
+ },
+ input, output);
+ }
+
+ if (left_over_loop_y)
+ {
+ window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
+ window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
+
+ Iterator input(in, window_in);
+ Iterator output(out, window_out);
+
+ // Compute left-over elements along the y dimension (1x1)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+
+ *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
+ }
+}
+#endif // __aarch64__
+} // namespace
+
+void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Destination auto initialization if not yet initialized
+ const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+ // Explicitly set the tensor shape to preserve dimensions
+ dst->set_tensor_shape(dst_shape);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst));
+
+ // Note: This kernel performs 16 elements per iteration.
+ // However, since we use a left-over for loop on both dimensions (X and Y), we must not read or write out of bounds.
+ // For this reason, num_elems_processed_per_iteration_x is set to 1.
+ const unsigned int num_elems_processed_per_iteration_x = 1;
+ const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());
+
+ // Configure kernel window
+ Window win =
+ calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(dst->num_dimensions());
+ dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
+
+ ICpuKernel::configure(win);
+}
+
+Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ // Error if input is not 8-bit, 16-bit or 32-bit
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4,
+ "Element size not supported");
+
+ // Validate configured destination
+ if (dst->total_size() != 0)
+ {
+ const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+
+void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ switch (src->info()->element_size())
+ {
+ case 1:
+ transpose_8bit_elements(src, dst, window);
+ break;
+ case 2:
+ transpose_16bit_elements(src, dst, window);
+ break;
+ case 4:
+ transpose_32bit_elements(src, dst, window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+}
+
+const char *CpuTransposeKernel::name() const
+{
+ return "CpuTransposeKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h
new file mode 100644
index 0000000000..e79a405677
--- /dev/null
+++ b/src/cpu/kernels/CpuTransposeKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H
+#define ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel which transposes the elements of a matrix */
+class CpuTransposeKernel : public ICpuKernel<CpuTransposeKernel>
+{
+public:
+ CpuTransposeKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTransposeKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] src Source tensor to transpose. Data types supported: All
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuTransposeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+};
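+
+/* A minimal usage sketch (illustrative only; real call sites drive the kernel through a cpu operator and the
+ * scheduler, and the exact scheduling hint may differ):
+ *
+ *   CpuTransposeKernel k;
+ *   k.configure(src.info(), dst.info()); // dst is auto-initialized to the transposed shape if empty
+ *   ITensorPack pack = {{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
+ *   NEScheduler::get().schedule_op(&k, Window::DimY, k.window(), pack);
+ */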
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H */
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..297ba63826
--- /dev/null
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *src, bool has_bias)
+{
+ TensorShape output_shape{src->tensor_shape()};
+
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (has_bias ? 1 : 0));
+
+ return output_shape;
+}
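+
+// For example, shared 3x3 weights with IFM=2 and OFM=4 ([3, 3, 2, 4]) collapse to [3*3*2, 4] = [18, 4] and the
+// two dimensions are then swapped to [4, 18]; with a bias appended the result is [4, 19], i.e. each of the 4
+// kernels occupies one X position with its 18 weights (plus optional bias) laid out along Y.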
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ if (biases != nullptr)
+ {
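+ // Weights are 4D [kernel_x, kernel_y, IFM, OFM] when shared and 5D [kernel_x, kernel_y, IFM, OFM, num_patches]
+ // when unshared, so the bias must be 1D [OFM] or 2D [OFM, num_patches] respectively.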
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3]));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] ||
+ biases->dimension(1) != src->tensor_shape()[4]));
+ }
+
+ // Checks performed when output is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ get_output_shape(src, biases != nullptr));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr))));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst));
+
+ // Configure kernel
+ Window window = calculate_max_window(*src, Steps());
+ window.set(Window::DimX, Window::Dimension(0, src->dimension(0), src->dimension(0)));
+ window.set(Window::DimY, Window::Dimension(0, src->dimension(1), src->dimension(1)));
+ window.set(Window::DimZ, Window::Dimension(0, src->dimension(2), src->dimension(2)));
+ ICpuKernel::configure(window);
+}
+
+Status CpuWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst));
+ return Status{};
+}
+
+void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto biases = tensors.get_const_tensor(TensorType::ACL_BIAS);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const unsigned int kernel_size_x = src->info()->dimension(0);
+ const unsigned int kernel_size_y = src->info()->dimension(1);
+ const unsigned int kernel_depth = src->info()->dimension(2);
+ const unsigned int input_stride_x = src->info()->strides_in_bytes().x();
+ const unsigned int input_stride_y = src->info()->strides_in_bytes().y();
+ const unsigned int input_stride_z = src->info()->strides_in_bytes().z();
+ const unsigned int output_stride_y = dst->info()->strides_in_bytes().y();
+
+ // Create iterators
+ Iterator in(src, window);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
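+
+ // Each kernel identified by (kernel_idx, kernel_idz) is written to a single X position of the output,
+ // with its linearized elements advancing along Y (output_stride_y per element); the optional bias is
+ // appended as the final Y entry after the loop below.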
+
+ // Linearize volume
+ for (unsigned int d = 0; d < kernel_depth; ++d)
+ {
+ for (unsigned int j = 0; j < kernel_size_y; ++j)
+ {
+ for (unsigned int i = 0; i < kernel_size_x; ++i)
+ {
+ std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
+ }
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
+ }
+
+ // Add bias
+ if (biases != nullptr)
+ {
+ std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)),
+ src->info()->element_size());
+ }
+ },
+ in);
+}
+
+const char *CpuWeightsReshapeKernel::name() const
+{
+ return "CpuWeightsReshapeKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h
new file mode 100644
index 0000000000..9310b3c784
--- /dev/null
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H
+#define ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to perform reshaping on the weights used by convolution and locally connected layers
+ *
+ * Rearranges each 3-dimensional kernel into a single row, leading to a matrix with linearized kernels.
+ * In combination with the @ref cpu::kernels::CpuIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CpuWeightsReshapeKernel : public ICpuKernel<CpuWeightsReshapeKernel>
+{
+public:
+ /** Default constructor */
+ CpuWeightsReshapeKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWeightsReshapeKernel);
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
+ * Data types supported: All
+ * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+ * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+ * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+ * @param[out] dst The output tensor info. Data types supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuWeightsReshapeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H */
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
new file mode 100644
index 0000000000..52e3f2549c
--- /dev/null
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads)
+ : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
+{
+}
+
+void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(window);
+ const ITensor *input_nhwc = tensors.get_const_tensor(TensorType::ACL_SRC);
+ const ITensor *winograd_input_transform = tensors.get_const_tensor(TensorType::ACL_DST);
+ const ITensor *workspace = tensors.get_const_tensor(TensorType::ACL_INT);
+
+ const unsigned int width_idx = 1;
+ const unsigned int height_idx = 2;
+ const unsigned int batch_idx = 3;
+ int element_size_in_bytes = input_nhwc->info()->element_size();
+ const auto src_strides = input_nhwc->info()->strides_in_bytes();
+
+ const size_t input_row_stride = src_strides[height_idx] / element_size_in_bytes;
+ const size_t input_col_stride = src_strides[width_idx] / element_size_in_bytes;
+ const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes;
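+
+ // The arm_conv winograd input transform expects strides expressed in elements rather than bytes,
+ // hence the division by the element size above.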
+ const auto input_nhwc_ptr =
+ reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes());
+ auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() +
+ winograd_input_transform->info()->offset_first_element_in_bytes());
+
+ _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride,
+ input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec,
+ workspace->buffer(), info.thread_id, _nthreads);
+}
+
+CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads)
+ : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
+{
+}
+
+// Inherited methods overridden:
+void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(window);
+ const ITensor *dst_nhwc = tensors.get_const_tensor(TensorType::ACL_DST);
+ const ITensor *winograd_output_transform = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensor *biases = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT);
+
+ const unsigned int width_idx = 1;
+ const unsigned int height_idx = 2;
+ const unsigned int batch_idx = 3;
+ const int element_size_in_bytes = dst_nhwc->info()->element_size();
+ const auto dst_strides = dst_nhwc->info()->strides_in_bytes();
+
+ const size_t out_row_stride = dst_strides[height_idx] / element_size_in_bytes;
+ const size_t out_col_stride = dst_strides[width_idx] / element_size_in_bytes;
+ const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes;
+ const auto wout_transf_ptr = reinterpret_cast<const void *>(
+ winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes());
+ auto dst_nhwc_ptr =
+ reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes());
+ void *biases_data_ptr = nullptr;
+ if (biases != nullptr)
+ {
+ biases_data_ptr = reinterpret_cast<void *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
+ }
+
+ // Output transform
+ _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr,
+ dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride,
+ workspace->buffer(), info.thread_id, _nthreads);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h
new file mode 100644
index 0000000000..8a3b745e85
--- /dev/null
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H
+#define ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/core/NEON/kernels/assembly/winograd.hpp"
+#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuWinogradConv2dTransformInputKernel final : public ICpuKernel<CpuWinogradConv2dTransformInputKernel>
+{
+public:
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuWinogradConv2dTransformInputKernel(const CpuWinogradConv2dTransformInputKernel &) = delete;
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuWinogradConv2dTransformInputKernel &operator=(const CpuWinogradConv2dTransformInputKernel &) = delete;
+
+ /** Prevent instances of this class from being moved as it contains references. */
+ CpuWinogradConv2dTransformInputKernel(CpuWinogradConv2dTransformInputKernel &&) = delete;
+
+ /** Prevent instances of this class from being moved as it contains references. */
+ CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete;
+
+ CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+ const char *name() const override
+ {
+ return "CpuWinogradConv2dTransformInputKernel";
+ }
+
+private:
+ arm_conv::winograd::WinogradImpl &_winograd_impl;
+ arm_conv::ConvolutionArgs &_conv_args;
+ uint32_t _nthreads;
+};
+class CpuWinogradConv2dTransformOutputKernel : public ICpuKernel<CpuWinogradConv2dTransformOutputKernel>
+{
+public:
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuWinogradConv2dTransformOutputKernel(const CpuWinogradConv2dTransformOutputKernel &) = delete;
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuWinogradConv2dTransformOutputKernel &operator=(const CpuWinogradConv2dTransformOutputKernel &) = delete;
+
+ /** Prevent instances of this class from being moved as it contains references. */
+ CpuWinogradConv2dTransformOutputKernel(CpuWinogradConv2dTransformOutputKernel &&) = delete;
+
+ /** Prevent instances of this class from being moved as it contains references. */
+ CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete;
+
+ CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+ const char *name() const override
+ {
+ return "CpuWinogradConv2dTransformOutputKernel";
+ }
+
+private:
+ arm_conv::winograd::WinogradImpl &_winograd_impl;
+ const arm_conv::ConvolutionArgs &_conv_args;
+ uint32_t _nthreads;
+};
+
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H*/
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..ddc6dc24cd
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp16Params = {static_cast<float16_t>(1e-7), 8};
+} // namespace
+
+void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float16_t, Fp16Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..e558f8c73e
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4};
+} // namespace
+void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float, Fp32Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h
new file mode 100644
index 0000000000..afeb6f7f3d
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+namespace arm_compute
+{
+namespace cpu
+{
+/** Constant parameters needed by the activation implementation.
+ * These parameters differ for each floating-point type
+ *
+ * @note These are passed as a struct because C++ does not allow a floating-point value as a non-type template parameter until C++20
+ **/
+struct ActFpImplParams
+{
+ float delta; /**< Minimum delta needed to avoid NaN on corner-cases of elementary functions */
+ int step_x; /**< Window step at the x dimension */
+};
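+
+// For example, the NEON FP32 kernel instantiates this as {1e-24f, 4} and the FP16 kernel as {1e-7f, 8},
+// i.e. the number of lanes in a 128-bit vector for each type.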
+
+#ifndef __aarch64__
+inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
+{
+ auto int_in = vreinterpretq_u32_f32(in);
+ return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
+}
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
+{
+ auto int_in = vreinterpretq_u16_f16(in);
+ return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
+}
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#endif /* __aarch64__ */
+
+template <typename T, const ActFpImplParams &P>
+void fp_neon_activation_impl(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType =
+ typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ constexpr int window_step_x = P.step_x;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ // In the non-aarch64 case, a small delta value is added to the input
+ // to prevent NaN values caused by zeros in inputs to SQRT.
+ // In the aarch64 case, we call vsqrt directly, so we don't use delta.
+#ifndef __aarch64__
+ const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{});
+#else /* #ifndef __aarch64__ */
+ const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{});
+ const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{});
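+ // const_inv_2 (0.5) and const_inv_sqrt_2 (~1/sqrt(2)) are only needed by the aarch64 GELU path:
+ // GELU(x) = x * 0.5 * (1 + erf(x / sqrt(2))).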
+#endif /* __aarch64__ */
+ const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+ const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
+ const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
+ const auto const_inv_6 = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{});
+ constexpr float soft_relu_thresh = 12.f;
+ const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{});
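+ // Above this threshold log(1 + exp(x)) ~= x to within a few ULPs, so SOFT_RELU simply returns x
+ // and avoids overflowing exp() for large inputs.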
+ const auto va = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{});
+ const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
+ const auto a = static_cast<T>(act_info.a());
+ const auto b = static_cast<T>(act_info.b());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = wrapper::vabs(vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = wrapper::vmla(vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = wrapper::vmax(const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin,
+ wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
+ wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+#ifdef __aarch64__
+ tmp = wrapper::vsqrt(vin);
+#else /* __aarch64__ */
+ {
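+ // sqrt(x) is computed as 1 / (1/sqrt(x)); delta keeps the reciprocal square root finite for
+ // zero inputs and the final mask forces those lanes back to exactly zero.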
+ const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
+ tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
+ tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
+ }
+#endif /* __aarch64__ */
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = wrapper::vmul(vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_6,
+ wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(
+ const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
+ break;
+#ifdef __aarch64__
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
+ break;
+#endif /* __aarch64__ */
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = std::abs(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = a * in + b;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = std::max<T>(static_cast<T>(0), in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max<T>(b, in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = (in > 0) ? in : a * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = std::sqrt(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = in * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = a * std::tanh(b * in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = in / (static_cast<T>(1) + std::exp(-a * in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::GELU:
+                    tmp = in * static_cast<T>(0.5f * (1.0f + std::erf(static_cast<float>(in) / 1.41421356237f)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp
new file mode 100644
index 0000000000..ddd186f9cb
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/lut.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifdef __aarch64__
+void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    // LUT does not provide any performance benefit for ReLU as it's a single max() operation.
+    ARM_COMPUTE_ERROR_ON(
+        (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) ||
+        act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU);
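+    // For 8-bit data the activation has been precomputed into a 256-entry table
+    // (act_info.lut()), so the kernel reduces to one table lookup per element.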
+ const auto window_end_x = window.x().end();
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
+}
+#endif // __aarch64__
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..1451301ea2
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+ const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+ const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
+ const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
+ const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
+ const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
+ const auto vconst_1 = vdupq_n_f32(1.f);
+
+#ifndef __aarch64__
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+#else // #ifndef __aarch64__
+ const auto const_inv_2 = vdupq_n_f32(0.5f);
+ const auto const_inv_sqrt_2 = vdupq_n_f32(0.70710678118f);
+#endif // __aarch64__
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+
+#ifndef __aarch64__
+ const auto const_6_f32 = vdupq_n_f32(6.f);
+ const auto const_0_f32 = vdupq_n_f32(0.f);
+ const auto const_3_f32 = vdupq_n_f32(3.f);
+ const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+#endif // __aarch64__
+
+ // Initialise scale/offset for re-quantization
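+    // q_out = q_in * s + o re-quantizes directly in the quantized domain; it is the
+    // composition of de-quantizing with (scale_in, offset_in) and re-quantizing with
+    // (scale_out, offset_out). E.g. qi_in = {0.5, 10}, qi_out = {0.25, 0} gives
+    // s = 2, o = -20, so q_in = 30 (real 10.0) maps to 30 * 2 - 20 = 40 (real 10.0).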
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ float32x4_t vs = vdupq_n_f32(s);
+ float32x4_t vo = vdupq_n_f32(o);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+
+ wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_u8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ const uint32x4x4_t pos_mask = {{
+ wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+ }};
+
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+ wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+ wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+ wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+ }};
+
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#else // #ifndef __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(vin_deq.val[0],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[0], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[1],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[1], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[2],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[2], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[3],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[3], const_inv_sqrt_2))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+#endif // __aarch64__
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f = tmp_f * 0.5f * (1.0f + std::erf(tmp_f / 1.41421356237f));
+                    tmp   = quantize_qasymm8(tmp_f, qi_out);
+ }
+#endif // __aarch64__
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..a2f588245a
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+ const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+ const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in);
+ const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in);
+ const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in);
+ const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0);
+#ifndef __aarch64__
+ const auto vconst_1 = vdupq_n_f32(1.f);
+ const auto vconst_0_f32 = vdupq_n_f32(0.f);
+#endif // __aarch64__
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+ const auto const_6_f32 = vdupq_n_f32(6.f);
+ const auto const_0_f32 = vdupq_n_f32(0.f);
+ const auto const_3_f32 = vdupq_n_f32(3.f);
+ const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+
+ // Initialise scale/offset for re-quantization
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ float32x4_t vs = vdupq_n_f32(s);
+ float32x4_t vo = vdupq_n_f32(o);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
+
+ wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_s8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+#ifdef __aarch64__
+ const uint32x4x4_t pos_mask = {{
+ wrapper::vcgtz(vin_deq.val[0]),
+ wrapper::vcgtz(vin_deq.val[1]),
+ wrapper::vcgtz(vin_deq.val[2]),
+ wrapper::vcgtz(vin_deq.val[3]),
+ }};
+#else // __aarch64__
+ const uint32x4x4_t pos_mask = {{
+ wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+ }};
+#endif // __aarch64__
+
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+ wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+ wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+ wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+ }};
+
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+#endif // __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
new file mode 100644
index 0000000000..891646ea00
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ constexpr int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto vconst_1 = vdupq_n_f32(1.f);
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
+
+ wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
+ ARM_COMPUTE_UNUSED(tmp);
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
+ wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
+ qsymm16_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
new file mode 100644
index 0000000000..19d9126556
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/cpu/kernels/lut/list.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const auto const_1 = svdup_n_f16(1.f);
+ const auto const_0 = svdup_n_f16(0.f);
+ const auto const_6 = svdup_n_f16(6.f);
+ const auto const_3 = svdup_n_f16(3.f);
+ const auto const_inv_6 = svdup_n_f16(0.166666667f);
+
+ const auto va = svdup_n_f16(act_info.a());
+ const auto vb = svdup_n_f16(act_info.b());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ svfloat16_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_f16(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f16_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f16_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va),
+ svmax_f16_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin,
+ svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f16_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svmul_f16_z(
+ pg, const_inv_6,
+ svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svinv_f16_z(pg, svadd_f16_z(pg, const_1,
+ svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f16(pg, output_ptr + x, tmp);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
+}
+
+void sve_fp16_activation_lut(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::F16);
+ const auto window_start_x = window.x().start();
+ const auto window_end_x = window.x().end();
+ const auto size = window_end_x - window_start_x;
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+ lut_u16_sve(reinterpret_cast<const uint16_t *>(act_info.lut_fp16().data()), 1U /* num_strings (UNUSED) */,
+ size, input_ptr + window_start_x, output_ptr + window_start_x);
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
new file mode 100644
index 0000000000..d1b075d52c
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEMath.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const auto const_1 = svdup_n_f32(1.f);
+ const auto const_0 = svdup_n_f32(0.f);
+ const auto const_6 = svdup_n_f32(6.f);
+ const auto const_3 = svdup_n_f32(3.f);
+ const auto const_inv_6 = svdup_n_f32(0.166666667f);
+ const auto soft_relu_thresh = svdup_n_f32(16.63553047f);
+
+ const auto va = svdup_n_f32(act_info.a());
+ const auto vb = svdup_n_f32(act_info.b());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ svfloat32_t tmp;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
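+            // pg covers only the elements still to be processed, so the final partial
+            // vector is handled by the same predicated loop instead of a scalar tail.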
+ do
+ {
+ const auto vin = svld1_f32(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f32_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f32_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va),
+ svmax_f32_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin,
+ svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin,
+ svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f32_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svmul_f32_z(
+ pg, const_inv_6,
+ svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svinv_f32_z(pg, svadd_f32_z(pg, const_1,
+ svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f32(pg, output_ptr + x, tmp);
+
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp
new file mode 100644
index 0000000000..5db8595a75
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifdef __aarch64__
+void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    // LUT does not provide any performance benefit for ReLU as it's a single max() operation.
+    ARM_COMPUTE_ERROR_ON(
+        (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) ||
+        act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU);
+ const auto window_end_x = window.x().end();
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = input.ptr();
+ auto output_ptr = output.ptr();
+ lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
+}
+#endif // __aarch64__
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
new file mode 100644
index 0000000000..7efa9e4b72
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+ const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+ const auto const_0 = quantize_qasymm8(0.f, qi_in);
+ const auto vconst_0 = svdup_n_u8(const_0);
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+
+ // Initialise scale/offset for re-quantization
+ bool requant = true;
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ {
+ requant = false;
+ }
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ auto vs = svdup_n_f32(s);
+ auto vo = svdup_n_f32(o);
+
+ // Initialise scale/offset for re-quantization with int32_t
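+    // s and o are also kept as int32 fixed-point values with 8 fractional bits
+    // (scaled by 1 << 8); the leaky relu path multiplies in int32 and shifts
+    // right by 8 afterwards to drop the fractional bits.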
+ const auto voffset_in = svdup_n_s32(qi_in.offset);
+ int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_s32 = svdup_n_s32(s_s32);
+ const auto vo_s32 = svdup_n_s32(o_s32);
+
+ // Initialise scale/offset for re-quantization for leaky relu
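+    // For the negative lanes of leaky relu the slope act_info.a() is folded into the
+    // re-quantization itself: the scale becomes s * a and the offset is adjusted to match.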
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+ const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ svuint8_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_u8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = svmax_u8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
+
+ // Expand to int32
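+                    // svmovlb/svmovlt widen the even/odd lanes separately; the matching
+                    // svqxtnb/svqxtnt narrowing below restores the original lane order.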
+ const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
+
+ // Compare elements to input offset
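+                    // Quantized values below the input zero-point correspond to negative real
+                    // values when the scale is positive, so those lanes take the leaky path.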
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+                    // Convert int32 vectors to uint16 vectors (with saturation)
+ const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                    // Convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_u8(pg, output_ptr + x, tmp);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
new file mode 100644
index 0000000000..e4667522dd
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+ const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+ const auto const_0 = quantize_qasymm8_signed(0.f, qi_in);
+ const auto vconst_0 = svdup_n_s8(const_0);
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+ const auto const_6_f32 = svdup_n_f32(6.f);
+ const auto const_0_f32 = svdup_n_f32(0.f);
+ const auto const_3_f32 = svdup_n_f32(3.f);
+ const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
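+ // The 0/3/6 constants and const_inv_6_f32 are only used by the HARD_SWISH branch below,
+ // which computes x * clamp(x + 3, 0, 6) / 6 on the dequantized values.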
+
+ // Initialise scale/offset for re-quantization
+ bool requant = true;
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ {
+ requant = false;
+ }
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ auto vs = svdup_n_f32(s);
+ auto vo = svdup_n_f32(o);
+
+ // Initialise scale/offset for re-quantization with int32_t
+ const auto voffset_in = svdup_n_s32(qi_in.offset);
+ int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_s32 = svdup_n_s32(s_s32);
+ const auto vo_s32 = svdup_n_s32(o_s32);
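+ // vs_s32/vo_s32 hold the requantization scale and offset with 8 fractional bits, so the
+ // integer LEAKY_RELU path can compute (x * s + o) >> 8 without going through float.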
+
+ // Initialise scale/offset for re-quantization for leaky relu
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+ const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
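+ // The leaky variants fold the negative slope act_info.a() into the scale/offset so that
+ // lanes below the zero-point are requantized with a single multiply-add as well.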
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ svint8_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_s8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = svmax_s8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, svget4_f32(vin_deq, 0),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 0),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 1),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 1),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 2),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 2),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 3),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 3),
+ const_3_f32))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
+
+ // Expand to int32
+ const svint32x4_t vin_s32 =
+ svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)),
+ svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin)));
+
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Apply the leaky slope to the lanes below the zero-point and, when needed, requantize to the output space
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+ // Convert the int32 results to int16 vectors (with saturation)
+ const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+ // Convert the int16 vectors to int8 vectors (with saturation)
+ tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_s8(pg, output_ptr + x, tmp);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
new file mode 100644
index 0000000000..f955893307
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/SVESymm.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
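+ // For TANH the activation computes a() * tanh(b() * x) on the dequantized values;
+ // for LU_BOUNDED_RELU a() and b() act as the upper and lower clamp bounds.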
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ svint16_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_s16(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep =
+ svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
+ svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_s16(pg, output_ptr + x, tmp);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h
new file mode 100644
index 0000000000..8c24adc3fe
--- /dev/null
+++ b/src/cpu/kernels/activation/list.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
+#define ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ACTIVATION_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+
+#ifdef __aarch64__
+DECLARE_ACTIVATION_KERNEL(neon_q8_activation_lut);
+#endif // __aarch64__
+DECLARE_ACTIVATION_KERNEL(sve2_q8_activation_lut);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation_lut);
+DECLARE_ACTIVATION_KERNEL(sve_fp32_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp32_activation);
+
+#undef DECLARE_ACTIVATION_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..e7679c14e3
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<float16_t>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..11a970bef4
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp32_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<float>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
new file mode 100644
index 0000000000..34938cc4c4
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+namespace arm_compute
+{
+namespace cpu
+{
+bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, false);
+}
+
+bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true);
+}
+
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ bool is_addition)
+{
+ const auto iq0 = src0->quantization_info().uniform();
+ const auto iq1 = src1->quantization_info().uniform();
+ const auto oq = dst->quantization_info().uniform();
+
+ const auto scale0 = iq0.scale / oq.scale;
+ const auto scale1 = iq1.scale / oq.scale;
+
+ if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
+ {
+ // The scale factor cannot be stored as 5.11 signed fixed-point number.
+ return false;
+ }
+
+ const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset);
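+ // The requantized result has the form dst = scale0 * in0 + scale1 * in1 + offset,
+ // i.e. both input zero-points are folded into the single additive 'offset' term checked below.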
+
+ const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset))
+ : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset));
+
+ if (max_acc > 1048575.f) // 2^20 - 1
+ {
+ // It might not be possible to store the result as 21.11 signed fixed-point number.
+ return false;
+ }
+
+ return true;
+}
+
+template <typename ScalarType>
+void add_q8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ add_sub_q8_neon_fixedpoint<ScalarType>(src0, src1, dst, policy, window, true /*is_addition*/);
+}
+
+template <typename ScalarType>
+void add_sub_q8_neon_fixedpoint(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ const auto in0_info = src0->info();
+ const auto in1_info = src1->info();
+
+ const auto &in0_shape = in0_info->tensor_shape();
+ const auto &in1_shape = in1_info->tensor_shape();
+
+ // Create input windows.
+ Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
+ Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);
+
+ // Clear the x dimension on the execution window as we process the whole row each iteration.
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16;
+ const auto window_start_x = window.x().start();
+ const auto window_end_x = window.x().end();
+ const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x();
+
+ const auto iq0_info = in0_info->quantization_info().uniform();
+ const auto iq1_info = in1_info->quantization_info().uniform();
+ const auto oq_info = dst->info()->quantization_info().uniform();
+ const auto in0_scale = iq0_info.scale / oq_info.scale;
+ const auto in1_scale = is_addition ? (iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale));
+ const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
+
+ constexpr float _2pow11 = 2048;
+ const auto in0_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in0_scale * _2pow11));
+ const auto in1_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in1_scale * _2pow11));
+ const auto offset_21p11 = static_cast<int32_t>(support::cpp11::lround(offset * _2pow11));
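+ // The per-input scales are stored as signed 5.11 fixed-point (11 fractional bits, hence the
+ // factor 2^11 = 2048) and the folded offset/accumulator as 21.11 in 32 bits.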
+
+ constexpr uint8_t shift_amount_remainder = 3;
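+ // The 21.11 accumulator is narrowed in two stages: a rounding shift by 3 produces a saturated
+ // 16-bit 8.8 value, and a further rounding shift by 8 drops the remaining fraction to give the
+ // final 8-bit result.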
+
+ if (is_broadcast_across_x)
+ {
+ // Prefix: a = non-broadcast, b = broadcast.
+
+ const auto is_broadcast_input_1 = in1_win.x().step() == 0;
+ auto a_win = is_broadcast_input_1 ? in0_win : in1_win;
+ auto b_win = is_broadcast_input_1 ? in1_win : in0_win;
+ const auto a_tensor = is_broadcast_input_1 ? src0 : src1;
+ const auto b_tensor = is_broadcast_input_1 ? src1 : src0;
+
+ const auto a_scale_5p11 = is_broadcast_input_1 ? in0_scale_5p11 : in1_scale_5p11;
+ const auto b_scale = is_broadcast_input_1 ? in1_scale : in0_scale;
+ const auto a_vscale_5p11 = wrapper::vdup_n(a_scale_5p11, wrapper::traits::vector_64_tag());
+
+#ifndef __aarch64__
+ const auto a_scale = is_broadcast_input_1 ? in0_scale : in1_scale;
+#endif // __aarch64__
+
+ // Clear the x dimension on the execution window as we process the whole row each iteration.
+ a_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator a_input_it(a_tensor, a_win);
+ Iterator b_input_it(b_tensor, b_win);
+ Iterator out_it(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+ const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ const auto b_val = *b_ptr;
+ const auto b_scaled = b_scale * b_val;
+ const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11));
+ const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11;
+ const auto b_vscaled_offseted_21p11 =
+ wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag());
+
+#ifndef __aarch64__
+ const auto b_scaled_offseted = b_scaled + offset;
+#endif // __aarch64__
+
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the input.
+ const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+
+ // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+ const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+ const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+
+ // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
+ // Widen and store the result in 32-bit integer.
+ const auto vout_21p11_00 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11);
+ const auto vout_21p11_01 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11);
+ const auto vout_21p11_10 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11);
+ const auto vout_21p11_11 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11);
+
+ // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+ const auto vout_8p8_0 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+ const auto vout_8p8_1 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+
+ // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
+
+ // Store the result.
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ // Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
+#ifdef __aarch64__
+ out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+ int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11));
+#else // __aarch64__
+ out_ptr[x] = utility::clamp<int, ScalarType>(
+ support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
+#endif // __aarch64__
+ }
+ },
+ b_input_it, a_input_it, out_it);
+ }
+ else
+ {
+ const auto vscale0_5p11 = wrapper::vdup_n(in0_scale_5p11, wrapper::traits::vector_64_tag());
+ const auto vscale1_5p11 = wrapper::vdup_n(in1_scale_5p11, wrapper::traits::vector_64_tag());
+ const auto voffset_21p11 = wrapper::vdup_n(offset_21p11, wrapper::traits::vector_128_tag());
+
+ // Clear the x dimension on the execution window as we process the whole row each iteration.
+ in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in0_it(src0, in0_win);
+ Iterator in1_it(src1, in1_win);
+ Iterator out_it(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+ const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+ const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+
+ // Widen the input elements to signed 16-bit regardless of the input signedness.
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+ const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+ const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+ const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+
+ // Multiply the input elements by the scale factor and add the offset.
+ // Widen and store the result in 32-bit integer.
+ const auto vscaled0_offseted_21p11_00 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_01 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_10 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_11 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11);
+
+ const auto vout_21p11_00 =
+ wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11);
+ const auto vout_21p11_01 =
+ wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11);
+ const auto vout_21p11_10 =
+ wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11);
+ const auto vout_21p11_11 =
+ wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11);
+
+ // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+ const auto vout_8p8_0 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+ const auto vout_8p8_1 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+
+ // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
+
+ // Store the result.
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ // Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
+#ifdef __aarch64__
+ out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+ int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11));
+#else // __aarch64__
+ out_ptr[x] = utility::clamp<int, ScalarType>(
+ support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
+#endif // __aarch64__
+ }
+ },
+ in0_it, in1_it, out_it);
+ }
+}
+
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const auto scale1 = iq1_info.scale / oq_info.scale;
+ const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
+ const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
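+ // Unlike the fixed-point kernels above, this path accumulates in float and rounds back to
+ // 8 bits using round-to-nearest (vcvtnq/lround) on AArch64 and truncation (vcvtq/trunc) otherwise.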
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+ const auto af_scale = is_broadcast_input_2 ? scale1 : scale2;
+ const auto bf_scale = is_broadcast_input_2 ? scale2 : scale1;
+ const auto vscale1 = vdupq_n_f32(af_scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = non_broadcast_input.ptr();
+ const auto output_ptr = output.ptr();
+
+ const auto broadcast_value = *broadcast_input.ptr();
+ const auto bf = vdupq_n_f32(float(broadcast_value) * bf_scale + offset);
+ const auto bfs = float(broadcast_value) * bf_scale + offset;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
+
+ const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+ const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+
+ const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
+
+#ifdef __aarch64__
+ rf_0 = vcvtnq_s32_f32(af_0);
+ rf_1 = vcvtnq_s32_f32(af_1);
+ rf_2 = vcvtnq_s32_f32(af_2);
+ rf_3 = vcvtnq_s32_f32(af_3);
+#else //__aarch64__
+ rf_0 = vcvtq_s32_f32(af_0);
+ rf_1 = vcvtq_s32_f32(af_1);
+ rf_2 = vcvtq_s32_f32(af_2);
+ rf_3 = vcvtq_s32_f32(af_3);
+#endif //__aarch64__
+
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+#ifdef __aarch64__
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+#else // __aarch64__
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+#endif // __aarch64__
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ const auto vscale1 = vdupq_n_f32(scale1);
+ const auto vscale2 = vdupq_n_f32(scale2);
+ const auto voffset = vdupq_n_f32(offset);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = input1.ptr();
+ const auto input2_ptr = input2.ptr();
+ const auto output_ptr = output.ptr();
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(input1_ptr + x);
+ const uint8x16_t b = vld1q_u8(input2_ptr + x);
+
+ const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+ const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+ const auto b_u16_0 = vmovl_u8(vget_low_u8(b));
+ const auto b_u16_1 = vmovl_u8(vget_high_u8(b));
+
+ const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+
+ const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2);
+ const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2);
+ const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2);
+ const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
+
+#ifdef __aarch64__
+ rf_0 = vcvtnq_s32_f32(bf_0);
+ rf_1 = vcvtnq_s32_f32(bf_1);
+ rf_2 = vcvtnq_s32_f32(bf_2);
+ rf_3 = vcvtnq_s32_f32(bf_3);
+#else //__aarch64__
+ rf_0 = vcvtq_s32_f32(bf_0);
+ rf_1 = vcvtq_s32_f32(bf_1);
+ rf_2 = vcvtq_s32_f32(bf_2);
+ rf_3 = vcvtq_s32_f32(bf_3);
+#endif //__aarch64__
+
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+#ifdef __aarch64__
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+#else // __aarch64__
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+#endif // __aarch64__
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const auto scale1 = iq1_info.scale / oq_info.scale;
+ const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
+ const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+ const auto af_scale = is_broadcast_input_2 ? scale1 : scale2;
+ const auto bf_scale = is_broadcast_input_2 ? scale2 : scale1;
+ const auto vscale1 = vdupq_n_f32(af_scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const auto bf = vdupq_n_f32(float(broadcast_value) * bf_scale + offset);
+ const auto bfs = float(broadcast_value) * bf_scale + offset;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
+
+ const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+ const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+
+ const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
+
+#ifdef __aarch64__
+ rf_0 = vcvtnq_s32_f32(af_0);
+ rf_1 = vcvtnq_s32_f32(af_1);
+ rf_2 = vcvtnq_s32_f32(af_2);
+ rf_3 = vcvtnq_s32_f32(af_3);
+#else //__aarch64__
+ rf_0 = vcvtq_s32_f32(af_0);
+ rf_1 = vcvtq_s32_f32(af_1);
+ rf_2 = vcvtq_s32_f32(af_2);
+ rf_3 = vcvtq_s32_f32(af_3);
+#endif //__aarch64__
+
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+#ifdef __aarch64__
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+#else // __aarch64__
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+#endif // __aarch64__
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ const auto vscale1 = vdupq_n_f32(scale1);
+ const auto vscale2 = vdupq_n_f32(scale2);
+ const auto voffset = vdupq_n_f32(offset);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t a = vld1q_s8(input1_ptr + x);
+ const int8x16_t b = vld1q_s8(input2_ptr + x);
+
+ const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+ const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+ const auto b_s16_0 = vmovl_s8(vget_low_s8(b));
+ const auto b_s16_1 = vmovl_s8(vget_high_s8(b));
+
+ const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+
+ const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2);
+ const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2);
+ const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2);
+ const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
+
+#ifdef __aarch64__
+ rf_0 = vcvtnq_s32_f32(bf_0);
+ rf_1 = vcvtnq_s32_f32(bf_1);
+ rf_2 = vcvtnq_s32_f32(bf_2);
+ rf_3 = vcvtnq_s32_f32(bf_3);
+#else //__aarch64__
+ rf_0 = vcvtq_s32_f32(bf_0);
+ rf_1 = vcvtq_s32_f32(bf_1);
+ rf_2 = vcvtq_s32_f32(bf_2);
+ rf_3 = vcvtq_s32_f32(bf_3);
+#endif //__aarch64__
+
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+#ifdef __aarch64__
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+#else // __aarch64__
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+#endif // __aarch64__
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+template void add_q8_neon_fixedpoint<int8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_q8_neon_fixedpoint<uint8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
new file mode 100644
index 0000000000..faa99baffe
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ADD_IMPL_H
+#define SRC_CORE_NEON_KERNELS_ADD_IMPL_H
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
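+ // The ConvertPolicy only affects how the additions below are performed: SATURATE uses the
+ // saturating vqadd/add_sat, any other policy wraps with the plain vadd/+.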
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ const auto res = (policy == ConvertPolicy::SATURATE)
+ ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v)
+ : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (policy == ConvertPolicy::SATURATE)
+ ? wrapper::add_sat(broadcast_value, non_broadcast_v)
+ : broadcast_value + non_broadcast_v;
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto val1 = wrapper::vloadq(input1_ptr + x);
+ const auto val2 = wrapper::vloadq(input2_ptr + x);
+ const auto res =
+ (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto val1 = *(input1_ptr + x);
+ const auto val2 = *(input2_ptr + x);
+ *(output_ptr + x) =
+ (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ bool is_addition);
+
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+template <typename ScalarType>
+void add_q8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+template <typename ScalarType>
+void add_sub_q8_neon_fixedpoint(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H
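For reference, the vector loop and the scalar tail of add_same_neon above produce the same per-element result; the following is a minimal scalar model of that result (a sketch only, meaningful for the integer specialisations where ConvertPolicy::SATURATE applies — the float paths simply add):

#include <algorithm>
#include <cstdint>
#include <limits>

// Scalar model of one add_same_neon<T> output element (sketch only).
// SATURATE clamps the widened sum back into T's range; WRAP lets it wrap.
template <typename T>
T add_one_element(T a, T b, bool saturate)
{
    if (!saturate)
    {
        return static_cast<T>(a + b); // ConvertPolicy::WRAP
    }
    const int64_t sum = static_cast<int64_t>(a) + static_cast<int64_t>(b);
    const int64_t lo  = static_cast<int64_t>(std::numeric_limits<T>::lowest());
    const int64_t hi  = static_cast<int64_t>(std::numeric_limits<T>::max());
    return static_cast<T>(std::min(hi, std::max(lo, sum))); // ConvertPolicy::SATURATE
}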
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
new file mode 100644
index 0000000000..f0bcebc9d2
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_u8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<uint8_t>(src0, src1, dst, policy, window);
+}
+
+void add_s16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<int16_t>(src0, src1, dst, policy, window);
+}
+
+void add_s32_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<int32_t>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..8195d229d9
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_qasymm8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..7e23096239
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_qasymm8_signed_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
new file mode 100644
index 0000000000..ac2de0557a
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_qsymm16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
+ const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+ const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+
+ const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
+ const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
+ const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
+ const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+ const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+#ifdef __aarch64__
+ rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+#else //__aarch64__
+ rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+#endif //__aarch64__
+
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(input1_ptr + x);
+ const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+ const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+ const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+ const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
+ const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+#ifdef __aarch64__
+ rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+#else //__aarch64__
+ rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+#endif //__aarch64__
+
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+ const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
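The QSYMM16 path above dequantizes both inputs to float (real = q * scale, no offset for symmetric quantization), adds in float, and requantizes with the destination scale; on AArch64 the vector path rounds to nearest (vcvtnq_s32_f32), while the Armv7 fallback truncates. A scalar sketch of one output element, assuming round-to-nearest as in the leftover loop:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar model of add_qsymm16_neon per element (sketch only).
int16_t add_qsymm16_one(int16_t a, float scale_a, int16_t b, float scale_b, float scale_out)
{
    const float real_sum = a * scale_a + b * scale_b;         // dequantize and add
    const long  q        = std::lround(real_sum / scale_out); // requantize (round to nearest)
    return static_cast<int16_t>(std::min<long>(32767, std::max<long>(-32768, q))); // saturating narrow
}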
diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp
new file mode 100644
index 0000000000..01dfe6c44b
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/fp16.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp16_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<float16_t>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp
new file mode 100644
index 0000000000..56771a5411
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/fp32.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp32_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<float>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp
new file mode 100644
index 0000000000..ca850fcef4
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/impl.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+#include <arm_sve.h>
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ const auto all_true_pg = wrapper::svptrue<ScalarType>();
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+ const bool is_sat = (policy == ConvertPolicy::SATURATE);
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
+ Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
+ Iterator output(dst, window);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
+
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
+ auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v)
+ : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
+ svst1(pg, output_ptr + x, res);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto val1 = svld1(pg, input1_ptr + x);
+ const auto val2 = svld1(pg, input2_ptr + x);
+ const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
+ svst1(pg, output_ptr + x, res);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
+ }
+}
+template void add_same_sve<float>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<uint8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int16_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int32_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void add_same_sve<float16_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+} // namespace cpu
+} // namespace arm_compute
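Unlike the NEON template, add_same_sve needs no separate leftover loop: svwhilelt yields an all-true predicate while a full vector of elements remains and a partial predicate for the tail, and the loop ends once svptest_any reports an empty predicate. The pattern in isolation, reduced to a float32 addition (a standalone sketch with a hypothetical function name):

#include <arm_sve.h>

// Predicated SVE loop: the final iteration runs with a partial predicate,
// so out-of-range lanes are neither loaded nor stored.
void add_f32_sve_sketch(const float *a, const float *b, float *out, int n)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, n);
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb));
        x += static_cast<int>(svcntw()); // number of 32-bit lanes in one vector
        pg = svwhilelt_b32(x, n);
    } while (svptest_any(svptrue_b32(), pg));
}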
diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h
new file mode 100644
index 0000000000..6a95d66826
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/impl.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H
+#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp
new file mode 100644
index 0000000000..4d17f2adbd
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/integer.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_u8_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<uint8_t>(src0, src1, dst, policy, window);
+}
+
+void add_s16_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<int16_t>(src0, src1, dst, policy, window);
+}
+
+void add_s32_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<int32_t>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
new file mode 100644
index 0000000000..40add9d51b
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_qasymm8_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+ const auto all_true_pg = svptrue_b8();
+
+ const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
+ const auto voffseto = svdup_n_f32(oq_info.offset);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+ const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
+ const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
+ const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
+ const svint32_t voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+
+ const auto bf_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+
+ do
+ {
+ const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
+
+ const auto af_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+
+ const auto rf_0 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+
+ const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+ svst1_u8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ const auto vscale1 = svdup_n_f32(iq1_info.scale);
+ const auto vscale2 = svdup_n_f32(iq2_info.scale);
+ const auto voffset1 = svdup_n_s32(iq1_info.offset);
+ const auto voffset2 = svdup_n_s32(iq2_info.offset);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto a = svld1_u8(pg, input1_ptr + x);
+ const auto b = svld1_u8(pg, input2_ptr + x);
+ const auto af_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+
+ const auto bf_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)),
+ vscale2);
+
+ const auto rf_0 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+ const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+
+ svst1_u8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
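In the SVE2 QASYMM8 kernel, svmovlb/svmovlt widen the even and odd lanes respectively, so each 8-bit vector becomes four s32 vectors (af_0..af_3, bf_0..bf_3), and the results are recombined with the matching saturating svqxtnb/svqxtnt pairs before the store. Per element the arithmetic is asymmetric dequantize, add, then requantize with the output offset folded in via svmla; a scalar sketch (the vector path's truncation toward zero in svcvt_u32_f32 is glossed over):

#include <algorithm>
#include <cstdint>

// Scalar model of add_qasymm8_sve2 per element (sketch only).
// QASYMM8: real = scale * (q - offset); the output offset is added back on requantization.
uint8_t add_qasymm8_one(uint8_t a, float scale_a, int32_t offset_a,
                        uint8_t b, float scale_b, int32_t offset_b,
                        float scale_out, int32_t offset_out)
{
    const float real_sum = scale_a * (static_cast<int32_t>(a) - offset_a) +
                           scale_b * (static_cast<int32_t>(b) - offset_b);
    const int32_t q = static_cast<int32_t>(real_sum / scale_out + static_cast<float>(offset_out)); // svmla(voffseto, sum, 1/scale_out)
    return static_cast<uint8_t>(std::min(255, std::max(0, q))); // saturating narrow to u8
}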
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
new file mode 100644
index 0000000000..2e585115e1
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_qasymm8_signed_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
+ const auto voffseto = svdup_n_f32(oq_info.offset);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+ const auto all_true_pg = svptrue_b8();
+
+ const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
+ const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
+ const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
+ const auto voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ const auto bf_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+
+ do
+ {
+ const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
+ const auto af_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+
+ const auto rf_0 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+ const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+
+ svst1_s8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ const auto vscale1 = svdup_n_f32(iq1_info.scale);
+ const auto vscale2 = svdup_n_f32(iq2_info.scale);
+ const auto voffset1 = svdup_n_s32(iq1_info.offset);
+ const auto voffset2 = svdup_n_s32(iq2_info.offset);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto a = svld1_s8(pg, input1_ptr + x);
+ const auto b = svld1_s8(pg, input2_ptr + x);
+
+ const auto af_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+
+ const auto bf_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
+
+ const auto rf_0 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+ const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+
+ svst1_s8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input1, input2, output);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
new file mode 100644
index 0000000000..17a42c2138
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_qsymm16_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const auto vscale1 = svdup_n_f32(iq1_info.scale);
+ const auto vscale2 = svdup_n_f32(iq2_info.scale);
+ const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
+ const auto all_true_pg = svptrue_b16();
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+
+ const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
+ const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
+
+ do
+ {
+ const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
+ const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+ const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+
+ const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+
+ svst1_s16(pg, output_ptr + x, res);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ auto a = svld1_s16(pg, input1_ptr + x);
+ auto b = svld1_s16(pg, input2_ptr + x);
+
+ const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+ const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+
+ const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
+ const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
+
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+
+ const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ svst1_s16(pg, output_ptr + x, res);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h
new file mode 100644
index 0000000000..1040c39a41
--- /dev/null
+++ b/src/cpu/kernels/add/list.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_ADD_LIST_H
+#define SRC_CORE_KERNELS_ADD_LIST_H
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ADD_KERNEL(func_name) \
+ void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+ const Window &window)
+
+DECLARE_ADD_KERNEL(add_qasymm8_neon);
+DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
+DECLARE_ADD_KERNEL(add_qsymm16_neon);
+DECLARE_ADD_KERNEL(add_fp32_neon);
+DECLARE_ADD_KERNEL(add_fp16_neon);
+DECLARE_ADD_KERNEL(add_u8_neon);
+DECLARE_ADD_KERNEL(add_s16_neon);
+DECLARE_ADD_KERNEL(add_s32_neon);
+DECLARE_ADD_KERNEL(add_fp32_sve);
+DECLARE_ADD_KERNEL(add_fp16_sve);
+DECLARE_ADD_KERNEL(add_u8_sve);
+DECLARE_ADD_KERNEL(add_s16_sve);
+DECLARE_ADD_KERNEL(add_s32_sve);
+DECLARE_ADD_KERNEL(add_qasymm8_sve2);
+DECLARE_ADD_KERNEL(add_qasymm8_signed_sve2);
+DECLARE_ADD_KERNEL(add_qsymm16_sve2);
+
+#undef DECLARE_ADD_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ADD_LIST_H
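DECLARE_ADD_KERNEL only stamps out the common five-parameter prototype; for example, DECLARE_ADD_KERNEL(add_u8_neon) expands to the declaration shown below. The definitions live in the per-architecture translation units above and are chosen at run time by the add kernel's selector, which is not part of this patch — the dispatch sketch below is hypothetical and for illustration only:

// Expansion of DECLARE_ADD_KERNEL(add_u8_neon):
void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst,
                 const ConvertPolicy &policy, const Window &window);

// Hypothetical selection sketch (not the library's actual dispatch table):
using AddKernelFn = void (*)(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &);

inline AddKernelFn pick_u8_add_kernel(bool cpu_has_sve)
{
    return cpu_has_sve ? &arm_compute::cpu::add_u8_sve : &arm_compute::cpu::add_u8_neon;
}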
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..b4b81aa78b
--- /dev/null
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
@@ -0,0 +1,965 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/CpuTypes.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+namespace
+{
+using arm_compute::float16_t;
+
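+// Fused "add + scale/offset + clamp" kernel written in AArch64 assembly.
+// Per element: sum = in0 + in1; out = clamp(sum * bn_mul + bn_add, minval, maxval).
+// When out_direct is non-null, the unscaled sum is also stored to out_direct.
+// Each main-loop iteration processes two rows and 32 fp16 columns.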
+void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out,
+ size_t out_stride,
+ float16_t *out_direct,
+ size_t out_direct_stride,
+ const float16_t *in0,
+ size_t in0_stride,
+ const float16_t *in1,
+ size_t in1_stride,
+ const float16_t *bn_mul,
+ const float16_t *bn_add,
+ const float16_t minval,
+ const float16_t maxval,
+ size_t width,
+ size_t height)
+{
+ struct KernelArgs
+ {
+ float16_t minval;
+ float16_t maxval;
+ } ka;
+ ka.minval = minval;
+ ka.maxval = maxval;
+
+ __asm__ __volatile__(
+ "ldr w21, [%x[args_ptr], %[offsetof_minval]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_maxval]]\n"
+ "cmp %x[width], #0x20\n"
+ "dup v13.8h, w21\n"
+ "dup v12.8h, w20\n"
+ "blt 7f\n"
+ "1:" // Column loop
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x12, %x[in0]\n"
+ "mov x11, %x[in1]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x10, %x[out]\n"
+ "mov x9, %x[out_direct]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x20, %x[height]\n"
+ "mov x28, x12\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "mov x27, x11\n"
+ "mov x26, x10\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q10, [x27, #0x0]\n"
+ "mov x25, x9\n"
+ "add x24, x28, %x[in0_stride]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q8, [x27, #0x10]\n"
+ "add x23, x27, %x[in1_stride]\n"
+ "add x22, x26, %x[out_stride]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x27, #0x20]\n"
+ "add x21, x25, %x[out_direct_stride]\n"
+ "cmp x20, #0x2\n"
+ "ldr q5, [x28, #0x30]\n"
+ "ldr q4, [x27, #0x30]\n"
+ "add x12, x24, %x[in0_stride]\n"
+ "add x11, x23, %x[in1_stride]\n"
+ "add x10, x22, %x[out_stride]\n"
+ "add x9, x21, %x[out_direct_stride]\n"
+ "csel x24, x24, x28, GE\n"
+ "csel x23, x23, x27, GE\n"
+ "csel x22, x22, x26, GE\n"
+ "csel x21, x21, x25, GE\n"
+ "subs x20, x20, #0x2\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "add x28, x28, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "ble 4f\n"
+ "2:" // Row loop
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "fadd v2.8h, v11.8h, v10.8h\n"
+ "fadd v1.8h, v9.8h, v8.8h\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q20, [x23, #0x10]\n"
+ "fadd v0.8h, v7.8h, v6.8h\n"
+ "fadd v23.8h, v5.8h, v4.8h\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "fadd v22.8h, v3.8h, v22.8h\n"
+ "fadd v21.8h, v21.8h, v20.8h\n"
+ "ldr q17, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x30]\n"
+ "fadd v20.8h, v19.8h, v18.8h\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "add x24, x24, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "cbz %x[out_direct], 3f\n"
+ "str q2, [x25, #0x0]\n"
+ "str q1, [x25, #0x10]\n"
+ "str q0, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x25, x25, #0x40\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q20, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "3:" // Main loop: No direct output
+ "mov v16.16b, v2.16b\n"
+ "mov v2.16b, v28.16b\n"
+ "fmla v2.8h, v16.8h, v24.8h\n"
+ "mov x28, x12\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "mov v18.16b, v1.16b\n"
+ "mov v1.16b, v29.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q5, [x28, #0x30]\n"
+ "mov v17.16b, v0.16b\n"
+ "mov v0.16b, v30.16b\n"
+ "mov v16.16b, v23.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "fmla v1.8h, v18.8h, v25.8h\n"
+ "mov x27, x11\n"
+ "ldr q10, [x27, #0x0]\n"
+ "ldr q8, [x27, #0x10]\n"
+ "fmla v0.8h, v17.8h, v26.8h\n"
+ "fmla v23.8h, v16.8h, v27.8h\n"
+ "ldr q6, [x27, #0x20]\n"
+ "ldr q4, [x27, #0x30]\n"
+ "mov v17.16b, v22.16b\n"
+ "mov v22.16b, v28.16b\n"
+ "mov v16.16b, v21.16b\n"
+ "mov v21.16b, v29.16b\n"
+ "fmla v22.8h, v17.8h, v24.8h\n"
+ "mov x25, x9\n"
+ "mov v17.16b, v20.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "fmla v21.8h, v16.8h, v25.8h\n"
+ "add x24, x28, %x[in0_stride]\n"
+ "mov v16.16b, v19.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "fmla v20.8h, v17.8h, v26.8h\n"
+ "add x23, x27, %x[in1_stride]\n"
+ "fmla v19.8h, v16.8h, v27.8h\n"
+ "fmin v2.8h, v2.8h, v12.8h\n"
+ "add x21, x25, %x[out_direct_stride]\n"
+ "cmp x20, #0x2\n"
+ "fmin v1.8h, v1.8h, v12.8h\n"
+ "fmin v0.8h, v0.8h, v12.8h\n"
+ "add x12, x24, %x[in0_stride]\n"
+ "add x11, x23, %x[in1_stride]\n"
+ "fmin v23.8h, v23.8h, v12.8h\n"
+ "fmax v2.8h, v2.8h, v13.8h\n"
+ "str q2, [x26, #0x0]\n"
+ "add x9, x21, %x[out_direct_stride]\n"
+ "fmax v1.8h, v1.8h, v13.8h\n"
+ "fmax v0.8h, v0.8h, v13.8h\n"
+ "str q1, [x26, #0x10]\n"
+ "csel x24, x24, x28, GE\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmin v22.8h, v22.8h, v12.8h\n"
+ "str q0, [x26, #0x20]\n"
+ "csel x23, x23, x27, GE\n"
+ "fmin v21.8h, v21.8h, v12.8h\n"
+ "fmin v20.8h, v20.8h, v12.8h\n"
+ "str q23, [x26, #0x30]\n"
+ "mov x26, x10\n"
+ "fmin v19.8h, v19.8h, v12.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "str q22, [x22, #0x0]\n"
+ "csel x21, x21, x25, GE\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "str q21, [x22, #0x10]\n"
+ "add x28, x28, #0x40\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "str q20, [x22, #0x20]\n"
+ "add x27, x27, #0x40\n"
+ "str q19, [x22, #0x30]\n"
+ "add x22, x26, %x[out_stride]\n"
+ "add x10, x22, %x[out_stride]\n"
+ "csel x22, x22, x26, GE\n"
+ "subs x20, x20, #0x2\n"
+ "bgt 2b\n"
+ "4:" // Row loop skip
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "fadd v2.8h, v11.8h, v10.8h\n"
+ "fadd v1.8h, v9.8h, v8.8h\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q20, [x23, #0x10]\n"
+ "fadd v0.8h, v7.8h, v6.8h\n"
+ "fadd v23.8h, v5.8h, v4.8h\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "fadd v22.8h, v3.8h, v22.8h\n"
+ "fadd v21.8h, v21.8h, v20.8h\n"
+ "ldr q17, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x30]\n"
+ "fadd v20.8h, v19.8h, v18.8h\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "add x24, x24, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "cbz %x[out_direct], 5f\n"
+ "str q2, [x25, #0x0]\n"
+ "str q1, [x25, #0x10]\n"
+ "str q0, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x25, x25, #0x40\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q20, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "5:" // Tail loop: No direct output
+ "mov v16.16b, v2.16b\n"
+ "mov v2.16b, v28.16b\n"
+ "fmla v2.8h, v16.8h, v24.8h\n"
+ "add %x[in0], %x[in0], #0x40\n"
+ "mov v16.16b, v1.16b\n"
+ "mov v1.16b, v29.16b\n"
+ "fmla v1.8h, v16.8h, v25.8h\n"
+ "add %x[in1], %x[in1], #0x40\n"
+ "mov v16.16b, v0.16b\n"
+ "mov v0.16b, v30.16b\n"
+ "fmla v0.8h, v16.8h, v26.8h\n"
+ "add %x[out], %x[out], #0x40\n"
+ "mov v16.16b, v23.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "fmla v23.8h, v16.8h, v27.8h\n"
+ "mov v16.16b, v22.16b\n"
+ "mov v22.16b, v28.16b\n"
+ "fmla v22.8h, v16.8h, v24.8h\n"
+ "mov v16.16b, v21.16b\n"
+ "mov v21.16b, v29.16b\n"
+ "fmla v21.8h, v16.8h, v25.8h\n"
+ "mov v16.16b, v20.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "fmla v20.8h, v16.8h, v26.8h\n"
+ "mov v16.16b, v19.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "fmla v19.8h, v16.8h, v27.8h\n"
+ "fmin v2.8h, v2.8h, v12.8h\n"
+ "fmin v1.8h, v1.8h, v12.8h\n"
+ "fmin v0.8h, v0.8h, v12.8h\n"
+ "fmin v23.8h, v23.8h, v12.8h\n"
+ "fmin v22.8h, v22.8h, v12.8h\n"
+ "fmin v21.8h, v21.8h, v12.8h\n"
+ "fmin v20.8h, v20.8h, v12.8h\n"
+ "fmin v19.8h, v19.8h, v12.8h\n"
+ "fmax v2.8h, v2.8h, v13.8h\n"
+ "fmax v1.8h, v1.8h, v13.8h\n"
+ "str q2, [x26, #0x0]\n"
+ "fmax v0.8h, v0.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "str q1, [x26, #0x10]\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "str q0, [x26, #0x20]\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "str q23, [x26, #0x30]\n"
+ "add x26, x26, #0x40\n"
+ "str q22, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q20, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "add x22, x22, #0x40\n"
+ "cbz %x[out_direct], 6f\n"
+ "add %x[out_direct], %x[out_direct], #0x40\n"
+ "6:" // No direct pointer update
+ "sub %x[width], %x[width], #0x20\n"
+ "cmp %x[width], #0x20\n"
+ "bge 1b\n"
+ "cbz %x[width], 58f\n"
+ "7:" // main loop skip
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x20, %x[height]\n"
+ "mov x12, %x[in0]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x11, %x[in1]\n"
+ "mov x10, %x[out]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x9, %x[out_direct]\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "8:" // tail loop: Row loop
+ "mov x28, x12\n"
+ "mov x27, x11\n"
+ "mov x26, x10\n"
+ "mov x25, x9\n"
+ "add x24, x28, %x[in0_stride]\n"
+ "add x23, x27, %x[in1_stride]\n"
+ "add x22, x26, %x[out_stride]\n"
+ "add x21, x25, %x[out_direct_stride]\n"
+ "cmp x20, #0x2\n"
+ "add x12, x24, %x[in0_stride]\n"
+ "add x11, x23, %x[in1_stride]\n"
+ "add x10, x22, %x[out_stride]\n"
+ "add x9, x21, %x[out_direct_stride]\n"
+ "csel x24, x24, x28, GE\n"
+ "csel x23, x23, x27, GE\n"
+ "csel x22, x22, x26, GE\n"
+ "csel x21, x21, x25, GE\n"
+ "tbz %x[width], #4, 16f\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q10, [x27, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q8, [x27, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q20, [x23, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, #0x20\n"
+ "tbz %x[width], #3, 12f\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x27, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "ldr q19, [x24, #0x0]\n"
+ "ldr q18, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "tbz %x[width], #2, 10f\n"
+ "ldr d5, [x28], #0x8\n"
+ "ldr d4, [x27], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz %x[width], #1, 9f\n"
+ "ld1 { v5.s }[2], [x28], #0x4\n"
+ "ld1 { v4.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v5.h }[6], [x28], #0x2\n"
+ "ld1 { v4.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x24], #0x2\n"
+ "ld1 { v16.h }[6], [x23], #0x2\n"
+ "b 24f\n"
+ "9:" // tail loop: unique 1: partial_0_28
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v5.h }[4], [x28], #0x2\n"
+ "ld1 { v4.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x24], #0x2\n"
+ "ld1 { v16.h }[4], [x23], #0x2\n"
+ "b 24f\n"
+ "10:" // tail loop: unique 1: partial_1_24
+ "tbz %x[width], #1, 11f\n"
+ "ldr s5, [x28], #0x4\n"
+ "ldr s4, [x27], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s16, [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v5.h }[2], [x28], #0x2\n"
+ "ld1 { v4.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x24], #0x2\n"
+ "ld1 { v16.h }[2], [x23], #0x2\n"
+ "b 24f\n"
+ "11:" // tail loop: unique 1: partial_0_24
+ "tbz %x[width], #0, 24f\n"
+ "ldr h5, [x28], #0x2\n"
+ "ldr h4, [x27], #0x2\n"
+ "ldr h17, [x24], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
+ "b 24f\n"
+ "12:" // tail loop: unique 1: partial_2_16
+ "tbz %x[width], #2, 14f\n"
+ "ldr d7, [x28], #0x8\n"
+ "ldr d6, [x27], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz %x[width], #1, 13f\n"
+ "ld1 { v7.s }[2], [x28], #0x4\n"
+ "ld1 { v6.s }[2], [x27], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v7.h }[6], [x28], #0x2\n"
+ "ld1 { v6.h }[6], [x27], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v18.h }[6], [x23], #0x2\n"
+ "b 24f\n"
+ "13:" // tail loop: unique 1: partial_0_20
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v7.h }[4], [x28], #0x2\n"
+ "ld1 { v6.h }[4], [x27], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v18.h }[4], [x23], #0x2\n"
+ "b 24f\n"
+ "14:" // tail loop: unique 1: partial_1_16
+ "tbz %x[width], #1, 15f\n"
+ "ldr s7, [x28], #0x4\n"
+ "ldr s6, [x27], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v7.h }[2], [x28], #0x2\n"
+ "ld1 { v6.h }[2], [x27], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v18.h }[2], [x23], #0x2\n"
+ "b 24f\n"
+ "15:" // tail loop: unique 1: partial_0_16
+ "tbz %x[width], #0, 24f\n"
+ "ldr h7, [x28], #0x2\n"
+ "ldr h6, [x27], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "b 24f\n"
+ "16:" // tail loop: unique 1: partial_3_0
+ "tbz %x[width], #3, 20f\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q10, [x27, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "tbz %x[width], #2, 18f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d8, [x27], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz %x[width], #1, 17f\n"
+ "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v8.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v9.h }[6], [x28], #0x2\n"
+ "ld1 { v8.h }[6], [x27], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "b 24f\n"
+ "17:" // tail loop: unique 1: partial_0_12
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v9.h }[4], [x28], #0x2\n"
+ "ld1 { v8.h }[4], [x27], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "b 24f\n"
+ "18:" // tail loop: unique 1: partial_1_8
+ "tbz %x[width], #1, 19f\n"
+ "ldr s9, [x28], #0x4\n"
+ "ldr s8, [x27], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v9.h }[2], [x28], #0x2\n"
+ "ld1 { v8.h }[2], [x27], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "b 24f\n"
+ "19:" // tail loop: unique 1: partial_0_8
+ "tbz %x[width], #0, 24f\n"
+ "ldr h9, [x28], #0x2\n"
+ "ldr h8, [x27], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "b 24f\n"
+ "20:" // tail loop: unique 1: partial_2_0
+ "tbz %x[width], #2, 22f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d3, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz %x[width], #1, 21f\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v10.s }[2], [x27], #0x4\n"
+ "ld1 { v3.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v11.h }[6], [x28], #0x2\n"
+ "ld1 { v10.h }[6], [x27], #0x2\n"
+ "ld1 { v3.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "b 24f\n"
+ "21:" // tail loop: unique 1: partial_0_4
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v11.h }[4], [x28], #0x2\n"
+ "ld1 { v10.h }[4], [x27], #0x2\n"
+ "ld1 { v3.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "b 24f\n"
+ "22:" // tail loop: unique 1: partial_1_0
+ "tbz %x[width], #1, 23f\n"
+ "ldr s11, [x28], #0x4\n"
+ "ldr s10, [x27], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "tbz %x[width], #0, 24f\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "ld1 { v10.h }[2], [x27], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "b 24f\n"
+ "23:" // tail loop: unique 1: partial_0_0
+ "ldr h11, [x28], #0x2\n"
+ "ldr h10, [x27], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "24:" // tail loop: unique 1: Done
+ "fadd v2.8h, v11.8h, v10.8h\n"
+ "fadd v1.8h, v9.8h, v8.8h\n"
+ "fadd v0.8h, v7.8h, v6.8h\n"
+ "fadd v23.8h, v5.8h, v4.8h\n"
+ "fadd v22.8h, v3.8h, v22.8h\n"
+ "fadd v21.8h, v21.8h, v20.8h\n"
+ "fadd v20.8h, v19.8h, v18.8h\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "cbz %x[out_direct], 41f\n"
+ "tbz %x[width], #4, 32f\n"
+ "str q2, [x25, #0x0]\n"
+ "str q1, [x25, #0x10]\n"
+ "add x25, x25, #0x20\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "tbz %x[width], #3, 28f\n"
+ "str q0, [x25, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "tbz %x[width], #2, 26f\n"
+ "str d23, [x25], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "tbz %x[width], #1, 25f\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v23.h }[6], [x25], #0x2\n"
+ "st1 { v19.h }[6], [x21], #0x2\n"
+ "b 40f\n"
+ "25:" // tail loop: Main loop: unique 2: partial_0_28
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v23.h }[4], [x25], #0x2\n"
+ "st1 { v19.h }[4], [x21], #0x2\n"
+ "b 40f\n"
+ "26:" // tail loop: Main loop: unique 2: partial_1_24
+ "tbz %x[width], #1, 27f\n"
+ "str s23, [x25], #0x4\n"
+ "str s19, [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v23.h }[2], [x25], #0x2\n"
+ "st1 { v19.h }[2], [x21], #0x2\n"
+ "b 40f\n"
+ "27:" // tail loop: Main loop: unique 2: partial_0_24
+ "tbz %x[width], #0, 40f\n"
+ "str h23, [x25], #0x2\n"
+ "str h19, [x21], #0x2\n"
+ "b 40f\n"
+ "28:" // tail loop: Main loop: unique 2: partial_2_16
+ "tbz %x[width], #2, 30f\n"
+ "str d0, [x25], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz %x[width], #1, 29f\n"
+ "st1 { v0.s }[2], [x25], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v0.h }[6], [x25], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "b 40f\n"
+ "29:" // tail loop: Main loop: unique 2: partial_0_20
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v0.h }[4], [x25], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "b 40f\n"
+ "30:" // tail loop: Main loop: unique 2: partial_1_16
+ "tbz %x[width], #1, 31f\n"
+ "str s0, [x25], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v0.h }[2], [x25], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "b 40f\n"
+ "31:" // tail loop: Main loop: unique 2: partial_0_16
+ "tbz %x[width], #0, 40f\n"
+ "str h0, [x25], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "b 40f\n"
+ "32:" // tail loop: Main loop: unique 2: partial_3_0
+ "tbz %x[width], #3, 36f\n"
+ "str q2, [x25, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "str q22, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "tbz %x[width], #2, 34f\n"
+ "str d1, [x25], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz %x[width], #1, 33f\n"
+ "st1 { v1.s }[2], [x25], #0x4\n"
+ "st1 { v21.s }[2], [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v1.h }[6], [x25], #0x2\n"
+ "st1 { v21.h }[6], [x21], #0x2\n"
+ "b 40f\n"
+ "33:" // tail loop: Main loop: unique 2: partial_0_12
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v1.h }[4], [x25], #0x2\n"
+ "st1 { v21.h }[4], [x21], #0x2\n"
+ "b 40f\n"
+ "34:" // tail loop: Main loop: unique 2: partial_1_8
+ "tbz %x[width], #1, 35f\n"
+ "str s1, [x25], #0x4\n"
+ "str s21, [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v1.h }[2], [x25], #0x2\n"
+ "st1 { v21.h }[2], [x21], #0x2\n"
+ "b 40f\n"
+ "35:" // tail loop: Main loop: unique 2: partial_0_8
+ "tbz %x[width], #0, 40f\n"
+ "str h1, [x25], #0x2\n"
+ "str h21, [x21], #0x2\n"
+ "b 40f\n"
+ "36:" // tail loop: Main loop: unique 2: partial_2_0
+ "tbz %x[width], #2, 38f\n"
+ "str d2, [x25], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz %x[width], #1, 37f\n"
+ "st1 { v2.s }[2], [x25], #0x4\n"
+ "st1 { v22.s }[2], [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v2.h }[6], [x25], #0x2\n"
+ "st1 { v22.h }[6], [x21], #0x2\n"
+ "b 40f\n"
+ "37:" // tail loop: Main loop: unique 2: partial_0_4
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v2.h }[4], [x25], #0x2\n"
+ "st1 { v22.h }[4], [x21], #0x2\n"
+ "b 40f\n"
+ "38:" // tail loop: Main loop: unique 2: partial_1_0
+ "tbz %x[width], #1, 39f\n"
+ "str s2, [x25], #0x4\n"
+ "str s22, [x21], #0x4\n"
+ "tbz %x[width], #0, 40f\n"
+ "st1 { v2.h }[2], [x25], #0x2\n"
+ "st1 { v22.h }[2], [x21], #0x2\n"
+ "b 40f\n"
+ "39:" // tail loop: Main loop: unique 2: partial_0_0
+ "str h2, [x25], #0x2\n"
+ "str h22, [x21], #0x2\n"
+ "40:" // tail loop: Main loop: unique 2: Done
+ "41:" // tail loop: Main loop: No direct output
+ "mov v16.16b, v2.16b\n"
+ "mov v2.16b, v28.16b\n"
+ "fmla v2.8h, v16.8h, v24.8h\n"
+ "mov v16.16b, v1.16b\n"
+ "mov v1.16b, v29.16b\n"
+ "fmla v1.8h, v16.8h, v25.8h\n"
+ "mov v16.16b, v0.16b\n"
+ "mov v0.16b, v30.16b\n"
+ "fmla v0.8h, v16.8h, v26.8h\n"
+ "mov v16.16b, v23.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "fmla v23.8h, v16.8h, v27.8h\n"
+ "mov v16.16b, v22.16b\n"
+ "mov v22.16b, v28.16b\n"
+ "fmla v22.8h, v16.8h, v24.8h\n"
+ "mov v16.16b, v21.16b\n"
+ "mov v21.16b, v29.16b\n"
+ "fmla v21.8h, v16.8h, v25.8h\n"
+ "mov v16.16b, v20.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "fmla v20.8h, v16.8h, v26.8h\n"
+ "mov v16.16b, v19.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "fmla v19.8h, v16.8h, v27.8h\n"
+ "fmin v2.8h, v2.8h, v12.8h\n"
+ "fmin v1.8h, v1.8h, v12.8h\n"
+ "fmin v0.8h, v0.8h, v12.8h\n"
+ "fmin v23.8h, v23.8h, v12.8h\n"
+ "fmin v22.8h, v22.8h, v12.8h\n"
+ "fmin v21.8h, v21.8h, v12.8h\n"
+ "fmin v20.8h, v20.8h, v12.8h\n"
+ "fmin v19.8h, v19.8h, v12.8h\n"
+ "fmax v2.8h, v2.8h, v13.8h\n"
+ "fmax v1.8h, v1.8h, v13.8h\n"
+ "fmax v0.8h, v0.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "tbz %x[width], #4, 49f\n"
+ "str q2, [x26, #0x0]\n"
+ "str q1, [x26, #0x10]\n"
+ "add x26, x26, #0x20\n"
+ "str q22, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "tbz %x[width], #3, 45f\n"
+ "str q0, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "tbz %x[width], #2, 43f\n"
+ "str d23, [x26], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz %x[width], #1, 42f\n"
+ "st1 { v23.s }[2], [x26], #0x4\n"
+ "st1 { v19.s }[2], [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v23.h }[6], [x26], #0x2\n"
+ "st1 { v19.h }[6], [x22], #0x2\n"
+ "b 57f\n"
+ "42:" // tail loop: unique 3: partial_0_28
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v23.h }[4], [x26], #0x2\n"
+ "st1 { v19.h }[4], [x22], #0x2\n"
+ "b 57f\n"
+ "43:" // tail loop: unique 3: partial_1_24
+ "tbz %x[width], #1, 44f\n"
+ "str s23, [x26], #0x4\n"
+ "str s19, [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v23.h }[2], [x26], #0x2\n"
+ "st1 { v19.h }[2], [x22], #0x2\n"
+ "b 57f\n"
+ "44:" // tail loop: unique 3: partial_0_24
+ "tbz %x[width], #0, 57f\n"
+ "str h23, [x26], #0x2\n"
+ "str h19, [x22], #0x2\n"
+ "b 57f\n"
+ "45:" // tail loop: unique 3: partial_2_16
+ "tbz %x[width], #2, 47f\n"
+ "str d0, [x26], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz %x[width], #1, 46f\n"
+ "st1 { v0.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v0.h }[6], [x26], #0x2\n"
+ "st1 { v20.h }[6], [x22], #0x2\n"
+ "b 57f\n"
+ "46:" // tail loop: unique 3: partial_0_20
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v0.h }[4], [x26], #0x2\n"
+ "st1 { v20.h }[4], [x22], #0x2\n"
+ "b 57f\n"
+ "47:" // tail loop: unique 3: partial_1_16
+ "tbz %x[width], #1, 48f\n"
+ "str s0, [x26], #0x4\n"
+ "str s20, [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v0.h }[2], [x26], #0x2\n"
+ "st1 { v20.h }[2], [x22], #0x2\n"
+ "b 57f\n"
+ "48:" // tail loop: unique 3: partial_0_16
+ "tbz %x[width], #0, 57f\n"
+ "str h0, [x26], #0x2\n"
+ "str h20, [x22], #0x2\n"
+ "b 57f\n"
+ "49:" // tail loop: unique 3: partial_3_0
+ "tbz %x[width], #3, 53f\n"
+ "str q2, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q22, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "tbz %x[width], #2, 51f\n"
+ "str d1, [x26], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz %x[width], #1, 50f\n"
+ "st1 { v1.s }[2], [x26], #0x4\n"
+ "st1 { v21.s }[2], [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v1.h }[6], [x26], #0x2\n"
+ "st1 { v21.h }[6], [x22], #0x2\n"
+ "b 57f\n"
+ "50:" // tail loop: unique 3: partial_0_12
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v1.h }[4], [x26], #0x2\n"
+ "st1 { v21.h }[4], [x22], #0x2\n"
+ "b 57f\n"
+ "51:" // tail loop: unique 3: partial_1_8
+ "tbz %x[width], #1, 52f\n"
+ "str s1, [x26], #0x4\n"
+ "str s21, [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v1.h }[2], [x26], #0x2\n"
+ "st1 { v21.h }[2], [x22], #0x2\n"
+ "b 57f\n"
+ "52:" // tail loop: unique 3: partial_0_8
+ "tbz %x[width], #0, 57f\n"
+ "str h1, [x26], #0x2\n"
+ "str h21, [x22], #0x2\n"
+ "b 57f\n"
+ "53:" // tail loop: unique 3: partial_2_0
+ "tbz %x[width], #2, 55f\n"
+ "str d2, [x26], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz %x[width], #1, 54f\n"
+ "st1 { v2.s }[2], [x26], #0x4\n"
+ "st1 { v22.s }[2], [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v2.h }[6], [x26], #0x2\n"
+ "st1 { v22.h }[6], [x22], #0x2\n"
+ "b 57f\n"
+ "54:" // tail loop: unique 3: partial_0_4
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v2.h }[4], [x26], #0x2\n"
+ "st1 { v22.h }[4], [x22], #0x2\n"
+ "b 57f\n"
+ "55:" // tail loop: unique 3: partial_1_0
+ "tbz %x[width], #1, 56f\n"
+ "str s2, [x26], #0x4\n"
+ "str s22, [x22], #0x4\n"
+ "tbz %x[width], #0, 57f\n"
+ "st1 { v2.h }[2], [x26], #0x2\n"
+ "st1 { v22.h }[2], [x22], #0x2\n"
+ "b 57f\n"
+ "56:" // tail loop: unique 3: partial_0_0
+ "str h2, [x26], #0x2\n"
+ "str h22, [x22], #0x2\n"
+ "57:" // tail loop: unique 3: Done
+ "subs x20, x20, #0x2\n"
+ "bgt 8b\n"
+ "58:" // odd columns skip
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+}
+
+} // namespace
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_mul_add_fp16_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ const size_t out_stride = final_output->info()->strides_in_bytes()[1];
+ const size_t out_direct_stride = (add_output != nullptr) ? add_output->info()->strides_in_bytes()[1] : 0;
+ const size_t in0_stride = input1->info()->strides_in_bytes()[1];
+ const size_t in1_stride = input2->info()->strides_in_bytes()[1];
+
+ float16_t minval = std::numeric_limits<half>::lowest();
+ float16_t maxval = std::numeric_limits<half>::max();
+
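+    // Fold the supported activations (RELU / BOUNDED_RELU / LU_BOUNDED_RELU) into the
+    // clamp bounds passed to the assembly kernel.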
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ minval = static_cast<float16_t>(0.f);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ minval = static_cast<float16_t>(0.f);
+ maxval = static_cast<float16_t>(act_info.a());
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ minval = static_cast<float16_t>(act_info.b());
+ maxval = static_cast<float16_t>(act_info.a());
+ }
+
+    // Clear X & Y dimensions on the execution window as we handle them manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in1_it(input1, window);
+ Iterator in2_it(input2, window);
+ Iterator out_it(final_output, window);
+
+ const size_t width = window.num_iterations(0);
+ const size_t height = window.num_iterations(1);
+
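+    // Invoke the assembly kernel once per window iteration; the variant with add_output
+    // also stores the intermediate (pre-scale) sum.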
+ if (add_output != nullptr)
+ {
+ Iterator add_out_it(add_output, window);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
+ reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
+ reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+ reinterpret_cast<float16_t *>(bn_mul->buffer()),
+ reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+ width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, nullptr,
+ out_direct_stride, reinterpret_cast<float16_t *>(in1_it.ptr()),
+ in0_stride, reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+ reinterpret_cast<float16_t *>(bn_mul->buffer()),
+ reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+ width, height);
+ },
+ in1_it, in2_it, out_it);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..f0444b6acd
--- /dev/null
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
@@ -0,0 +1,733 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#ifdef __aarch64__
+namespace
+{
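+// fp32 variant of the fused "add + scale/offset + clamp" kernel (AArch64 assembly).
+// Per element: sum = in0 + in1; out = clamp(sum * bn_mul + bn_add, minval, maxval),
+// with the unscaled sum optionally written to out_direct. Each main-loop iteration
+// processes two rows and 16 fp32 columns.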
+void a64_add_bn_clamp_direct_fp32_2x16(float *out,
+ size_t out_stride,
+ float *out_direct,
+ size_t out_direct_stride,
+ const float *in0,
+ size_t in0_stride,
+ const float *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const float minval,
+ const float maxval,
+ size_t width,
+ size_t height)
+{
+ struct KernelArgs
+ {
+ float minval;
+ float maxval;
+ } ka;
+ ka.minval = minval;
+ ka.maxval = maxval;
+
+ __asm__ __volatile__(
+ "ldr w21, [%x[args_ptr], %[offsetof_minval]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_maxval]]\n"
+ "cmp %x[width], #0x10\n"
+ "dup v13.4s, w21\n"
+ "dup v12.4s, w20\n"
+ "blt 7f\n"
+ "1:" // Column loop
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x12, %x[in0]\n"
+ "mov x11, %x[in1]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x10, %x[out]\n"
+ "mov x9, %x[out_direct]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x20, %x[height]\n"
+ "mov x28, x12\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "mov x27, x11\n"
+ "mov x26, x10\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q10, [x27, #0x0]\n"
+ "mov x25, x9\n"
+ "add x24, x28, %x[in0_stride]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q8, [x27, #0x10]\n"
+ "add x23, x27, %x[in1_stride]\n"
+ "add x22, x26, %x[out_stride]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x27, #0x20]\n"
+ "add x21, x25, %x[out_direct_stride]\n"
+ "cmp x20, #0x2\n"
+ "ldr q5, [x28, #0x30]\n"
+ "ldr q4, [x27, #0x30]\n"
+ "add x12, x24, %x[in0_stride]\n"
+ "add x11, x23, %x[in1_stride]\n"
+ "add x10, x22, %x[out_stride]\n"
+ "add x9, x21, %x[out_direct_stride]\n"
+ "csel x24, x24, x28, GE\n"
+ "csel x23, x23, x27, GE\n"
+ "csel x22, x22, x26, GE\n"
+ "csel x21, x21, x25, GE\n"
+ "subs x20, x20, #0x2\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "add x28, x28, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "ble 4f\n"
+ "2:" // Row loop
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "fadd v2.4s, v11.4s, v10.4s\n"
+ "fadd v1.4s, v9.4s, v8.4s\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q20, [x23, #0x10]\n"
+ "fadd v0.4s, v7.4s, v6.4s\n"
+ "fadd v23.4s, v5.4s, v4.4s\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "fadd v22.4s, v3.4s, v22.4s\n"
+ "fadd v21.4s, v21.4s, v20.4s\n"
+ "ldr q17, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x30]\n"
+ "fadd v20.4s, v19.4s, v18.4s\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "add x24, x24, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "cbz %x[out_direct], 3f\n"
+ "str q2, [x25, #0x0]\n"
+ "str q1, [x25, #0x10]\n"
+ "str q0, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x25, x25, #0x40\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q20, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "3:" // Main loop: No direct output
+ "mov v16.16b, v2.16b\n"
+ "mov v2.16b, v28.16b\n"
+ "fmla v2.4s, v16.4s, v24.4s\n"
+ "mov x28, x12\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "mov v18.16b, v1.16b\n"
+ "mov v1.16b, v29.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q5, [x28, #0x30]\n"
+ "mov v17.16b, v0.16b\n"
+ "mov v0.16b, v30.16b\n"
+ "mov v16.16b, v23.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "fmla v1.4s, v18.4s, v25.4s\n"
+ "mov x27, x11\n"
+ "ldr q10, [x27, #0x0]\n"
+ "ldr q8, [x27, #0x10]\n"
+ "fmla v0.4s, v17.4s, v26.4s\n"
+ "fmla v23.4s, v16.4s, v27.4s\n"
+ "ldr q6, [x27, #0x20]\n"
+ "ldr q4, [x27, #0x30]\n"
+ "mov v17.16b, v22.16b\n"
+ "mov v22.16b, v28.16b\n"
+ "mov v16.16b, v21.16b\n"
+ "mov v21.16b, v29.16b\n"
+ "fmla v22.4s, v17.4s, v24.4s\n"
+ "mov x25, x9\n"
+ "mov v17.16b, v20.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "fmla v21.4s, v16.4s, v25.4s\n"
+ "add x24, x28, %x[in0_stride]\n"
+ "mov v16.16b, v19.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "fmla v20.4s, v17.4s, v26.4s\n"
+ "add x23, x27, %x[in1_stride]\n"
+ "fmla v19.4s, v16.4s, v27.4s\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "add x21, x25, %x[out_direct_stride]\n"
+ "cmp x20, #0x2\n"
+ "fmin v1.4s, v1.4s, v12.4s\n"
+ "fmin v0.4s, v0.4s, v12.4s\n"
+ "add x12, x24, %x[in0_stride]\n"
+ "add x11, x23, %x[in1_stride]\n"
+ "fmin v23.4s, v23.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v13.4s\n"
+ "str q2, [x26, #0x0]\n"
+ "add x9, x21, %x[out_direct_stride]\n"
+ "fmax v1.4s, v1.4s, v13.4s\n"
+ "fmax v0.4s, v0.4s, v13.4s\n"
+ "str q1, [x26, #0x10]\n"
+ "csel x24, x24, x28, GE\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v12.4s\n"
+ "str q0, [x26, #0x20]\n"
+ "csel x23, x23, x27, GE\n"
+ "fmin v21.4s, v21.4s, v12.4s\n"
+ "fmin v20.4s, v20.4s, v12.4s\n"
+ "str q23, [x26, #0x30]\n"
+ "mov x26, x10\n"
+ "fmin v19.4s, v19.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "str q22, [x22, #0x0]\n"
+ "csel x21, x21, x25, GE\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "str q21, [x22, #0x10]\n"
+ "add x28, x28, #0x40\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "str q20, [x22, #0x20]\n"
+ "add x27, x27, #0x40\n"
+ "str q19, [x22, #0x30]\n"
+ "add x22, x26, %x[out_stride]\n"
+ "add x10, x22, %x[out_stride]\n"
+ "csel x22, x22, x26, GE\n"
+ "subs x20, x20, #0x2\n"
+ "bgt 2b\n"
+ "4:" // Row loop skip
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "fadd v2.4s, v11.4s, v10.4s\n"
+ "fadd v1.4s, v9.4s, v8.4s\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q20, [x23, #0x10]\n"
+ "fadd v0.4s, v7.4s, v6.4s\n"
+ "fadd v23.4s, v5.4s, v4.4s\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "fadd v22.4s, v3.4s, v22.4s\n"
+ "fadd v21.4s, v21.4s, v20.4s\n"
+ "ldr q17, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x30]\n"
+ "fadd v20.4s, v19.4s, v18.4s\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "add x24, x24, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "cbz %x[out_direct], 5f\n"
+ "str q2, [x25, #0x0]\n"
+ "str q1, [x25, #0x10]\n"
+ "str q0, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x25, x25, #0x40\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q20, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "5:" // Tail loop: No direct output
+ "mov v16.16b, v2.16b\n"
+ "mov v2.16b, v28.16b\n"
+ "fmla v2.4s, v16.4s, v24.4s\n"
+ "add %x[in0], %x[in0], #0x40\n"
+ "mov v16.16b, v1.16b\n"
+ "mov v1.16b, v29.16b\n"
+ "fmla v1.4s, v16.4s, v25.4s\n"
+ "add %x[in1], %x[in1], #0x40\n"
+ "mov v16.16b, v0.16b\n"
+ "mov v0.16b, v30.16b\n"
+ "fmla v0.4s, v16.4s, v26.4s\n"
+ "add %x[out], %x[out], #0x40\n"
+ "mov v16.16b, v23.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "fmla v23.4s, v16.4s, v27.4s\n"
+ "mov v16.16b, v22.16b\n"
+ "mov v22.16b, v28.16b\n"
+ "fmla v22.4s, v16.4s, v24.4s\n"
+ "mov v16.16b, v21.16b\n"
+ "mov v21.16b, v29.16b\n"
+ "fmla v21.4s, v16.4s, v25.4s\n"
+ "mov v16.16b, v20.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "fmla v20.4s, v16.4s, v26.4s\n"
+ "mov v16.16b, v19.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "fmla v19.4s, v16.4s, v27.4s\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "fmin v1.4s, v1.4s, v12.4s\n"
+ "fmin v0.4s, v0.4s, v12.4s\n"
+ "fmin v23.4s, v23.4s, v12.4s\n"
+ "fmin v22.4s, v22.4s, v12.4s\n"
+ "fmin v21.4s, v21.4s, v12.4s\n"
+ "fmin v20.4s, v20.4s, v12.4s\n"
+ "fmin v19.4s, v19.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v13.4s\n"
+ "fmax v1.4s, v1.4s, v13.4s\n"
+ "str q2, [x26, #0x0]\n"
+ "fmax v0.4s, v0.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "str q1, [x26, #0x10]\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "str q0, [x26, #0x20]\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "str q23, [x26, #0x30]\n"
+ "add x26, x26, #0x40\n"
+ "str q22, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q20, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "add x22, x22, #0x40\n"
+ "cbz %x[out_direct], 6f\n"
+ "add %x[out_direct], %x[out_direct], #0x40\n"
+ "6:" // No direct pointer update
+ "sub %x[width], %x[width], #0x10\n"
+ "cmp %x[width], #0x10\n"
+ "bge 1b\n"
+ "cbz %x[width], 34f\n"
+ "7:" // main loop skip
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x20, %x[height]\n"
+ "mov x12, %x[in0]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x11, %x[in1]\n"
+ "mov x10, %x[out]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x9, %x[out_direct]\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "8:" // tail loop: Row loop
+ "mov x28, x12\n"
+ "mov x27, x11\n"
+ "mov x26, x10\n"
+ "mov x25, x9\n"
+ "add x24, x28, %x[in0_stride]\n"
+ "add x23, x27, %x[in1_stride]\n"
+ "add x22, x26, %x[out_stride]\n"
+ "add x21, x25, %x[out_direct_stride]\n"
+ "cmp x20, #0x2\n"
+ "add x12, x24, %x[in0_stride]\n"
+ "add x11, x23, %x[in1_stride]\n"
+ "add x10, x22, %x[out_stride]\n"
+ "add x9, x21, %x[out_direct_stride]\n"
+ "csel x24, x24, x28, GE\n"
+ "csel x23, x23, x27, GE\n"
+ "csel x22, x22, x26, GE\n"
+ "csel x21, x21, x25, GE\n"
+ "tbz %x[width], #3, 12f\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q10, [x27, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q8, [x27, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q20, [x23, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, #0x20\n"
+ "tbz %x[width], #2, 10f\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x27, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "ldr q19, [x24, #0x0]\n"
+ "ldr q18, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "tbz %x[width], #1, 9f\n"
+ "ldr d5, [x28], #0x8\n"
+ "ldr d4, [x27], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz %x[width], #0, 16f\n"
+ "ld1 { v5.s }[2], [x28], #0x4\n"
+ "ld1 { v4.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x23], #0x4\n"
+ "b 16f\n"
+ "9:" // tail loop: unique 1: partial_0_12
+ "tbz %x[width], #0, 16f\n"
+ "ldr s5, [x28], #0x4\n"
+ "ldr s4, [x27], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s16, [x23], #0x4\n"
+ "b 16f\n"
+ "10:" // tail loop: unique 1: partial_1_8
+ "tbz %x[width], #1, 11f\n"
+ "ldr d7, [x28], #0x8\n"
+ "ldr d6, [x27], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz %x[width], #0, 16f\n"
+ "ld1 { v7.s }[2], [x28], #0x4\n"
+ "ld1 { v6.s }[2], [x27], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "b 16f\n"
+ "11:" // tail loop: unique 1: partial_0_8
+ "tbz %x[width], #0, 16f\n"
+ "ldr s7, [x28], #0x4\n"
+ "ldr s6, [x27], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "b 16f\n"
+ "12:" // tail loop: unique 1: partial_2_0
+ "tbz %x[width], #2, 14f\n"
+ "ldr q11, [x28, #0x0]\n"
+ "ldr q10, [x27, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "tbz %x[width], #1, 13f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d8, [x27], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz %x[width], #0, 16f\n"
+ "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v8.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "b 16f\n"
+ "13:" // tail loop: unique 1: partial_0_4
+ "tbz %x[width], #0, 16f\n"
+ "ldr s9, [x28], #0x4\n"
+ "ldr s8, [x27], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "b 16f\n"
+ "14:" // tail loop: unique 1: partial_1_0
+ "tbz %x[width], #1, 15f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d3, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz %x[width], #0, 16f\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v10.s }[2], [x27], #0x4\n"
+ "ld1 { v3.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "b 16f\n"
+ "15:" // tail loop: unique 1: partial_0_0
+ "ldr s11, [x28], #0x4\n"
+ "ldr s10, [x27], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "16:" // tail loop: unique 1: Done
+ "fadd v2.4s, v11.4s, v10.4s\n"
+ "fadd v1.4s, v9.4s, v8.4s\n"
+ "fadd v0.4s, v7.4s, v6.4s\n"
+ "fadd v23.4s, v5.4s, v4.4s\n"
+ "fadd v22.4s, v3.4s, v22.4s\n"
+ "fadd v21.4s, v21.4s, v20.4s\n"
+ "fadd v20.4s, v19.4s, v18.4s\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "cbz %x[out_direct], 25f\n"
+ "tbz %x[width], #3, 20f\n"
+ "str q2, [x25, #0x0]\n"
+ "str q1, [x25, #0x10]\n"
+ "add x25, x25, #0x20\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "tbz %x[width], #2, 18f\n"
+ "str q0, [x25, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "tbz %x[width], #1, 17f\n"
+ "str d23, [x25], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "tbz %x[width], #0, 24f\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x21], #0x4\n"
+ "b 24f\n"
+ "17:" // tail loop: Main loop: unique 2: partial_0_12
+ "tbz %x[width], #0, 24f\n"
+ "str s23, [x25], #0x4\n"
+ "str s19, [x21], #0x4\n"
+ "b 24f\n"
+ "18:" // tail loop: Main loop: unique 2: partial_1_8
+ "tbz %x[width], #1, 19f\n"
+ "str d0, [x25], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz %x[width], #0, 24f\n"
+ "st1 { v0.s }[2], [x25], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "b 24f\n"
+ "19:" // tail loop: Main loop: unique 2: partial_0_8
+ "tbz %x[width], #0, 24f\n"
+ "str s0, [x25], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "b 24f\n"
+ "20:" // tail loop: Main loop: unique 2: partial_2_0
+ "tbz %x[width], #2, 22f\n"
+ "str q2, [x25, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "str q22, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "tbz %x[width], #1, 21f\n"
+ "str d1, [x25], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz %x[width], #0, 24f\n"
+ "st1 { v1.s }[2], [x25], #0x4\n"
+ "st1 { v21.s }[2], [x21], #0x4\n"
+ "b 24f\n"
+ "21:" // tail loop: Main loop: unique 2: partial_0_4
+ "tbz %x[width], #0, 24f\n"
+ "str s1, [x25], #0x4\n"
+ "str s21, [x21], #0x4\n"
+ "b 24f\n"
+ "22:" // tail loop: Main loop: unique 2: partial_1_0
+ "tbz %x[width], #1, 23f\n"
+ "str d2, [x25], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz %x[width], #0, 24f\n"
+ "st1 { v2.s }[2], [x25], #0x4\n"
+ "st1 { v22.s }[2], [x21], #0x4\n"
+ "b 24f\n"
+ "23:" // tail loop: Main loop: unique 2: partial_0_0
+ "str s2, [x25], #0x4\n"
+ "str s22, [x21], #0x4\n"
+ "24:" // tail loop: Main loop: unique 2: Done
+ "25:" // tail loop: Main loop: No direct output
+ "mov v16.16b, v2.16b\n"
+ "mov v2.16b, v28.16b\n"
+ "fmla v2.4s, v16.4s, v24.4s\n"
+ "mov v16.16b, v1.16b\n"
+ "mov v1.16b, v29.16b\n"
+ "fmla v1.4s, v16.4s, v25.4s\n"
+ "mov v16.16b, v0.16b\n"
+ "mov v0.16b, v30.16b\n"
+ "fmla v0.4s, v16.4s, v26.4s\n"
+ "mov v16.16b, v23.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "fmla v23.4s, v16.4s, v27.4s\n"
+ "mov v16.16b, v22.16b\n"
+ "mov v22.16b, v28.16b\n"
+ "fmla v22.4s, v16.4s, v24.4s\n"
+ "mov v16.16b, v21.16b\n"
+ "mov v21.16b, v29.16b\n"
+ "fmla v21.4s, v16.4s, v25.4s\n"
+ "mov v16.16b, v20.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "fmla v20.4s, v16.4s, v26.4s\n"
+ "mov v16.16b, v19.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "fmla v19.4s, v16.4s, v27.4s\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "fmin v1.4s, v1.4s, v12.4s\n"
+ "fmin v0.4s, v0.4s, v12.4s\n"
+ "fmin v23.4s, v23.4s, v12.4s\n"
+ "fmin v22.4s, v22.4s, v12.4s\n"
+ "fmin v21.4s, v21.4s, v12.4s\n"
+ "fmin v20.4s, v20.4s, v12.4s\n"
+ "fmin v19.4s, v19.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v13.4s\n"
+ "fmax v1.4s, v1.4s, v13.4s\n"
+ "fmax v0.4s, v0.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "tbz %x[width], #3, 29f\n"
+ "str q2, [x26, #0x0]\n"
+ "str q1, [x26, #0x10]\n"
+ "add x26, x26, #0x20\n"
+ "str q22, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "tbz %x[width], #2, 27f\n"
+ "str q0, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "tbz %x[width], #1, 26f\n"
+ "str d23, [x26], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz %x[width], #0, 33f\n"
+ "st1 { v23.s }[2], [x26], #0x4\n"
+ "st1 { v19.s }[2], [x22], #0x4\n"
+ "b 33f\n"
+ "26:" // tail loop: unique 3: partial_0_12
+ "tbz %x[width], #0, 33f\n"
+ "str s23, [x26], #0x4\n"
+ "str s19, [x22], #0x4\n"
+ "b 33f\n"
+ "27:" // tail loop: unique 3: partial_1_8
+ "tbz %x[width], #1, 28f\n"
+ "str d0, [x26], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz %x[width], #0, 33f\n"
+ "st1 { v0.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x22], #0x4\n"
+ "b 33f\n"
+ "28:" // tail loop: unique 3: partial_0_8
+ "tbz %x[width], #0, 33f\n"
+ "str s0, [x26], #0x4\n"
+ "str s20, [x22], #0x4\n"
+ "b 33f\n"
+ "29:" // tail loop: unique 3: partial_2_0
+ "tbz %x[width], #2, 31f\n"
+ "str q2, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q22, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "tbz %x[width], #1, 30f\n"
+ "str d1, [x26], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz %x[width], #0, 33f\n"
+ "st1 { v1.s }[2], [x26], #0x4\n"
+ "st1 { v21.s }[2], [x22], #0x4\n"
+ "b 33f\n"
+ "30:" // tail loop: unique 3: partial_0_4
+ "tbz %x[width], #0, 33f\n"
+ "str s1, [x26], #0x4\n"
+ "str s21, [x22], #0x4\n"
+ "b 33f\n"
+ "31:" // tail loop: unique 3: partial_1_0
+ "tbz %x[width], #1, 32f\n"
+ "str d2, [x26], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz %x[width], #0, 33f\n"
+ "st1 { v2.s }[2], [x26], #0x4\n"
+ "st1 { v22.s }[2], [x22], #0x4\n"
+ "b 33f\n"
+ "32:" // tail loop: unique 3: partial_0_0
+ "str s2, [x26], #0x4\n"
+ "str s22, [x22], #0x4\n"
+ "33:" // tail loop: unique 3: Done
+ "subs x20, x20, #0x2\n"
+ "bgt 8b\n"
+ "34:" // odd columns skip
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+}
+} // namespace
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_mul_add_fp32_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ const size_t out_stride = final_output->info()->strides_in_bytes()[1];
+ const size_t out_direct_stride = (add_output != nullptr) ? add_output->info()->strides_in_bytes()[1] : 0;
+ const size_t in0_stride = input1->info()->strides_in_bytes()[1];
+ const size_t in1_stride = input2->info()->strides_in_bytes()[1];
+
+ float minval = std::numeric_limits<float>::lowest();
+ float maxval = std::numeric_limits<float>::max();
+
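+    // Fold the supported activations (RELU / BOUNDED_RELU / LU_BOUNDED_RELU) into the
+    // clamp bounds passed to the assembly kernel.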
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ minval = 0.f;
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ minval = 0.f;
+ maxval = act_info.a();
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ minval = act_info.b();
+ maxval = act_info.a();
+ }
+
+    // Clear X & Y dimensions on the execution window as we handle them manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in1_it(input1, window);
+ Iterator in2_it(input2, window);
+ Iterator out_it(final_output, window);
+
+ const size_t width = window.num_iterations(0);
+ const size_t height = window.num_iterations(1);
+
+ if (add_output != nullptr)
+ {
+ Iterator add_out_it(add_output, window);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp32_2x16(
+ reinterpret_cast<float *>(out_it.ptr()), out_stride, reinterpret_cast<float *>(add_out_it.ptr()),
+ out_direct_stride, reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<float *>(in2_it.ptr()), in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+ reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp32_2x16(
+ reinterpret_cast<float *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<float *>(in1_it.ptr()), in0_stride, reinterpret_cast<float *>(in2_it.ptr()),
+ in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+ reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+ },
+ in1_it, in2_it, out_it);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // __aarch64__
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..035805c944
--- /dev/null
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#ifdef __aarch64__
+namespace
+{
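+// QASYMM8 variant of the fused "add + scale/offset + clamp" kernel (AArch64 assembly).
+// Both inputs are dequantized with their (zero-point, scale) pairs, summed in fp32,
+// scaled by bn_mul and offset by bn_add, then requantized to uint8 using out_zeropt and
+// out_scale and clamped to [minval, maxval]. When out_direct is non-null, the intermediate
+// sum is requantized with out_direct_zeropt/out_direct_scale and stored there as well.
+// Each main-loop iteration processes two rows and 16 columns.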
+void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out,
+ size_t out_stride,
+ uint8_t *out_direct,
+ size_t out_direct_stride,
+ const uint8_t *in0,
+ size_t in0_stride,
+ const uint8_t *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const uint8_t minval,
+ const uint8_t maxval,
+ int32_t out_zeropt,
+ float out_scale,
+ int32_t out_direct_zeropt,
+ float out_direct_scale,
+ int32_t in0_zeropt,
+ float in0_scale,
+ int32_t in1_zeropt,
+ float in1_scale,
+ size_t width,
+ size_t height)
+{
+ float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
+ struct KernelArgs
+ {
+ const float *scales;
+ int32_t in0_zeropt;
+ int32_t in1_zeropt;
+ int32_t out_zeropt;
+ int32_t out_direct_zeropt;
+ int32_t minval;
+ int32_t maxval;
+ } ka;
+ ka.scales = scales;
+ ka.in0_zeropt = in0_zeropt;
+ ka.in1_zeropt = in1_zeropt;
+ ka.out_zeropt = out_zeropt;
+ ka.out_direct_zeropt = out_direct_zeropt;
+ ka.minval = minval;
+ ka.maxval = maxval;
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[args_ptr], %[offsetof_scales]]\n"
+ "ld1 { v0.4s }, [x20]\n"
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n"
+ "1:" // Column loop
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x23, %x[height]\n"
+ "mov x12, %x[in0]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x11, %x[in1]\n"
+ "mov x10, %x[out]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x9, %x[out_direct]\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "2:" // Row loop
+ "mov x28, x12\n"
+ "ldr d4, [x28, #0x0]\n"
+ "ldr d3, [x28, #0x8]\n"
+ "add x21, x28, %x[in0_stride]\n"
+ "mov x27, x11\n"
+ "ldr d13, [x27, #0x0]\n"
+ "ldr d12, [x27, #0x8]\n"
+ "cmp x23, #0x2\n"
+ "add x12, x21, %x[in0_stride]\n"
+ "csel x21, x21, x28, GE\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ldr d11, [x21, #0x8]\n"
+ "add x20, x27, %x[in1_stride]\n"
+ "add x11, x20, %x[in1_stride]\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "csel x20, x20, x27, GE\n"
+ "ldr d10, [x20, #0x0]\n"
+ "ldr d9, [x20, #0x8]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ushll v2.8h, v2.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
+ "mov x26, x10\n"
+ "dup v16.8h, w21\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "mov x25, x9\n"
+ "add x24, x26, %x[out_stride]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "add x22, x25, %x[out_direct_stride]\n"
+ "add x10, x24, %x[out_stride]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ssubl v1.4s, v4.4h, v16.4h\n"
+ "add x9, x22, %x[out_direct_stride]\n"
+ "csel x24, x24, x26, GE\n"
+ "ssubl2 v4.4s, v4.8h, v16.8h\n"
+ "ssubl v23.4s, v3.4h, v16.4h\n"
+ "csel x22, x22, x25, GE\n"
+ "ssubl2 v3.4s, v3.8h, v16.8h\n"
+ "ssubl v22.4s, v2.4h, v16.4h\n"
+ "ssubl2 v2.4s, v2.8h, v16.8h\n"
+ "ssubl v21.4s, v11.4h, v16.4h\n"
+ "ssubl2 v11.4s, v11.8h, v16.8h\n"
+ "dup v20.8h, w20\n"
+ "ssubl v19.4s, v13.4h, v20.4h\n"
+ "ssubl2 v13.4s, v13.8h, v20.8h\n"
+ "ssubl v18.4s, v12.4h, v20.4h\n"
+ "ssubl2 v12.4s, v12.8h, v20.8h\n"
+ "ssubl v17.4s, v10.4h, v20.4h\n"
+ "ssubl2 v10.4s, v10.8h, v20.8h\n"
+ "ssubl v16.4s, v9.4h, v20.4h\n"
+ "ssubl2 v9.4s, v9.8h, v20.8h\n"
+ "scvtf v8.4s, v1.4s\n"
+ "scvtf v7.4s, v4.4s\n"
+ "scvtf v6.4s, v23.4s\n"
+ "scvtf v5.4s, v3.4s\n"
+ "scvtf v4.4s, v22.4s\n"
+ "scvtf v3.4s, v2.4s\n"
+ "scvtf v2.4s, v21.4s\n"
+ "scvtf v1.4s, v11.4s\n"
+ "scvtf v19.4s, v19.4s\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "fmla v8.4s, v19.4s, v0.s[1]\n"
+ "scvtf v13.4s, v13.4s\n"
+ "fmul v7.4s, v7.4s, v0.s[0]\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "scvtf v18.4s, v18.4s\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "fmla v6.4s, v18.4s, v0.s[1]\n"
+ "scvtf v12.4s, v12.4s\n"
+ "fmul v5.4s, v5.4s, v0.s[0]\n"
+ "fmla v5.4s, v12.4s, v0.s[1]\n"
+ "scvtf v17.4s, v17.4s\n"
+ "fmul v4.4s, v4.4s, v0.s[0]\n"
+ "fmla v4.4s, v17.4s, v0.s[1]\n"
+ "scvtf v10.4s, v10.4s\n"
+ "fmul v3.4s, v3.4s, v0.s[0]\n"
+ "fmla v3.4s, v10.4s, v0.s[1]\n"
+ "scvtf v16.4s, v16.4s\n"
+ "fmul v2.4s, v2.4s, v0.s[0]\n"
+ "fmla v2.4s, v16.4s, v0.s[1]\n"
+ "scvtf v9.4s, v9.4s\n"
+ "fmul v1.4s, v1.4s, v0.s[0]\n"
+ "fmla v1.4s, v9.4s, v0.s[1]\n"
+ "cbz %x[out_direct], 3f\n"
+ "fmul v23.4s, v8.4s, v0.s[3]\n"
+ "fmul v22.4s, v7.4s, v0.s[3]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
+ "fmul v21.4s, v6.4s, v0.s[3]\n"
+ "fmul v20.4s, v5.4s, v0.s[3]\n"
+ "fmul v19.4s, v4.4s, v0.s[3]\n"
+ "fmul v18.4s, v3.4s, v0.s[3]\n"
+ "fmul v16.4s, v2.4s, v0.s[3]\n"
+ "fmul v17.4s, v1.4s, v0.s[3]\n"
+ "fcvtas v23.4s, v23.4s\n"
+ "fcvtas v22.4s, v22.4s\n"
+ "fcvtas v21.4s, v21.4s\n"
+ "fcvtas v20.4s, v20.4s\n"
+ "fcvtas v19.4s, v19.4s\n"
+ "fcvtas v18.4s, v18.4s\n"
+ "fcvtas v16.4s, v16.4s\n"
+ "fcvtas v17.4s, v17.4s\n"
+ "uzp1 v22.8h, v23.8h, v22.8h\n"
+ "uzp1 v20.8h, v21.8h, v20.8h\n"
+ "uzp1 v18.8h, v19.8h, v18.8h\n"
+ "uzp1 v17.8h, v16.8h, v17.8h\n"
+ "dup v16.8h, w20\n"
+ "add v22.8h, v22.8h, v16.8h\n"
+ "add v20.8h, v20.8h, v16.8h\n"
+ "add v18.8h, v18.8h, v16.8h\n"
+ "add v17.8h, v17.8h, v16.8h\n"
+ "movi v16.8h, #0xff\n"
+ "smin v22.8h, v22.8h, v16.8h\n"
+ "smin v20.8h, v20.8h, v16.8h\n"
+ "smin v18.8h, v18.8h, v16.8h\n"
+ "smin v17.8h, v17.8h, v16.8h\n"
+ "movi v16.8h, #0x0\n"
+ "smax v22.8h, v22.8h, v16.8h\n"
+ "smax v20.8h, v20.8h, v16.8h\n"
+ "smax v18.8h, v18.8h, v16.8h\n"
+ "smax v17.8h, v17.8h, v16.8h\n"
+ "xtn v22.8b, v22.8h\n"
+ "str d22, [x25, #0x0]\n"
+ "xtn v20.8b, v20.8h\n"
+ "xtn v18.8b, v18.8h\n"
+ "str d20, [x25, #0x8]\n"
+ "xtn v17.8b, v17.8h\n"
+ "str d18, [x22, #0x0]\n"
+ "str d17, [x22, #0x8]\n"
+ "3:" // Main loop: No direct output
+ "mov v19.16b, v28.16b\n"
+ "mov v13.16b, v29.16b\n"
+ "fmla v19.4s, v8.4s, v24.4s\n"
+ "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
+ "mov v18.16b, v30.16b\n"
+ "mov v12.16b, v31.16b\n"
+ "fmla v13.4s, v7.4s, v25.4s\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
+ "mov v17.16b, v28.16b\n"
+ "mov v10.16b, v29.16b\n"
+ "fmla v18.4s, v6.4s, v26.4s\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
+ "mov v16.16b, v30.16b\n"
+ "mov v9.16b, v31.16b\n"
+ "fmla v12.4s, v5.4s, v27.4s\n"
+ "subs x23, x23, #0x2\n"
+ "fmla v17.4s, v4.4s, v24.4s\n"
+ "fmla v10.4s, v3.4s, v25.4s\n"
+ "fmul v8.4s, v19.4s, v0.s[2]\n"
+ "fmla v16.4s, v2.4s, v26.4s\n"
+ "fmla v9.4s, v1.4s, v27.4s\n"
+ "fmul v7.4s, v13.4s, v0.s[2]\n"
+ "fmul v6.4s, v18.4s, v0.s[2]\n"
+ "fmul v5.4s, v12.4s, v0.s[2]\n"
+ "fmul v4.4s, v17.4s, v0.s[2]\n"
+ "fmul v3.4s, v10.4s, v0.s[2]\n"
+ "fmul v2.4s, v16.4s, v0.s[2]\n"
+ "fmul v1.4s, v9.4s, v0.s[2]\n"
+ "fcvtas v8.4s, v8.4s\n"
+ "fcvtas v7.4s, v7.4s\n"
+ "fcvtas v6.4s, v6.4s\n"
+ "fcvtas v5.4s, v5.4s\n"
+ "fcvtas v4.4s, v4.4s\n"
+ "fcvtas v3.4s, v3.4s\n"
+ "fcvtas v2.4s, v2.4s\n"
+ "fcvtas v1.4s, v1.4s\n"
+ "uzp1 v7.8h, v8.8h, v7.8h\n"
+ "uzp1 v5.8h, v6.8h, v5.8h\n"
+ "uzp1 v3.8h, v4.8h, v3.8h\n"
+ "uzp1 v1.8h, v2.8h, v1.8h\n"
+ "dup v16.8h, w22\n"
+ "add v7.8h, v7.8h, v16.8h\n"
+ "add v5.8h, v5.8h, v16.8h\n"
+ "add v3.8h, v3.8h, v16.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w21\n"
+ "smin v7.8h, v7.8h, v16.8h\n"
+ "smin v5.8h, v5.8h, v16.8h\n"
+ "smin v3.8h, v3.8h, v16.8h\n"
+ "smin v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w20\n"
+ "smax v7.8h, v7.8h, v16.8h\n"
+ "smax v5.8h, v5.8h, v16.8h\n"
+ "smax v3.8h, v3.8h, v16.8h\n"
+ "smax v1.8h, v1.8h, v16.8h\n"
+ "xtn v7.8b, v7.8h\n"
+ "str d7, [x26, #0x0]\n"
+ "xtn v5.8b, v5.8h\n"
+ "xtn v3.8b, v3.8h\n"
+ "str d5, [x26, #0x8]\n"
+ "xtn v1.8b, v1.8h\n"
+ "str d3, [x24, #0x0]\n"
+ "str d1, [x24, #0x8]\n"
+ "bgt 2b\n"
+ "add %x[in0], %x[in0], #0x10\n"
+ "add %x[in1], %x[in1], #0x10\n"
+ "add %x[out], %x[out], #0x10\n"
+ "cbz %x[out_direct], 4f\n"
+ "add %x[out_direct], %x[out_direct], #0x10\n"
+ "4:" // No direct pointer update
+ "sub %x[width], %x[width], #0x10\n"
+ "cmp %x[width], #0x10\n"
+ "bge 1b\n"
+ "cbz %x[width], 32f\n"
+ "5:" // main loop skip
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x23, %x[height]\n"
+ "mov x12, %x[in0]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x11, %x[in1]\n"
+ "mov x10, %x[out]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x9, %x[out_direct]\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "6:" // tail loop: Row loop
+ "mov x28, x12\n"
+ "mov x27, x11\n"
+ "mov x26, x10\n"
+ "mov x25, x9\n"
+ "add x21, x28, %x[in0_stride]\n"
+ "add x20, x27, %x[in1_stride]\n"
+ "add x24, x26, %x[out_stride]\n"
+ "add x22, x25, %x[out_direct_stride]\n"
+ "cmp x23, #0x2\n"
+ "add x12, x21, %x[in0_stride]\n"
+ "add x11, x20, %x[in1_stride]\n"
+ "add x10, x24, %x[out_stride]\n"
+ "add x9, x22, %x[out_direct_stride]\n"
+ "csel x21, x21, x28, GE\n"
+ "csel x20, x20, x27, GE\n"
+ "csel x24, x24, x26, GE\n"
+ "csel x22, x22, x25, GE\n"
+ "tbz %x[width], #3, 10f\n"
+ "ldr d4, [x28, #0x0]\n"
+ "ldr d13, [x27, #0x0]\n"
+ "add x28, x28, #0x8\n"
+ "add x27, x27, #0x8\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ldr d10, [x20, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "add x20, x20, #0x8\n"
+ "tbz %x[width], #2, 8f\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s12, [x27], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[width], #1, 7f\n"
+ "ld1 { v3.h }[2], [x28], #0x2\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v3.b }[6], [x28], #0x1\n"
+ "ld1 { v12.b }[6], [x27], #0x1\n"
+ "ld1 { v11.b }[6], [x21], #0x1\n"
+ "ld1 { v9.b }[6], [x20], #0x1\n"
+ "b 14f\n"
+ "7:" // tail loop: unique 1: partial_0_12
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v3.b }[4], [x28], #0x1\n"
+ "ld1 { v12.b }[4], [x27], #0x1\n"
+ "ld1 { v11.b }[4], [x21], #0x1\n"
+ "ld1 { v9.b }[4], [x20], #0x1\n"
+ "b 14f\n"
+ "8:" // tail loop: unique 1: partial_1_8
+ "tbz %x[width], #1, 9f\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h12, [x27], #0x2\n"
+ "ldr h11, [x21], #0x2\n"
+ "ldr h9, [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v12.b }[2], [x27], #0x1\n"
+ "ld1 { v11.b }[2], [x21], #0x1\n"
+ "ld1 { v9.b }[2], [x20], #0x1\n"
+ "b 14f\n"
+ "9:" // tail loop: unique 1: partial_0_8
+ "tbz %x[width], #0, 14f\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b12, [x27], #0x1\n"
+ "ldr b11, [x21], #0x1\n"
+ "ldr b9, [x20], #0x1\n"
+ "b 14f\n"
+ "10:" // tail loop: unique 1: partial_2_0
+ "tbz %x[width], #2, 12f\n"
+ "ldr s4, [x28], #0x4\n"
+ "ldr s13, [x27], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[width], #1, 11f\n"
+ "ld1 { v4.h }[2], [x28], #0x2\n"
+ "ld1 { v13.h }[2], [x27], #0x2\n"
+ "ld1 { v2.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v4.b }[6], [x28], #0x1\n"
+ "ld1 { v13.b }[6], [x27], #0x1\n"
+ "ld1 { v2.b }[6], [x21], #0x1\n"
+ "ld1 { v10.b }[6], [x20], #0x1\n"
+ "b 14f\n"
+ "11:" // tail loop: unique 1: partial_0_4
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v4.b }[4], [x28], #0x1\n"
+ "ld1 { v13.b }[4], [x27], #0x1\n"
+ "ld1 { v2.b }[4], [x21], #0x1\n"
+ "ld1 { v10.b }[4], [x20], #0x1\n"
+ "b 14f\n"
+ "12:" // tail loop: unique 1: partial_1_0
+ "tbz %x[width], #1, 13f\n"
+ "ldr h4, [x28], #0x2\n"
+ "ldr h13, [x27], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr h10, [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v4.b }[2], [x28], #0x1\n"
+ "ld1 { v13.b }[2], [x27], #0x1\n"
+ "ld1 { v2.b }[2], [x21], #0x1\n"
+ "ld1 { v10.b }[2], [x20], #0x1\n"
+ "b 14f\n"
+ "13:" // tail loop: unique 1: partial_0_0
+ "ldr b4, [x28], #0x1\n"
+ "ldr b13, [x27], #0x1\n"
+ "ldr b2, [x21], #0x1\n"
+ "ldr b10, [x20], #0x1\n"
+ "14:" // tail loop: unique 1: Done
+ "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
+ "ushll v2.8h, v2.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "dup v16.8h, w21\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ssubl v1.4s, v4.4h, v16.4h\n"
+ "ssubl2 v4.4s, v4.8h, v16.8h\n"
+ "ssubl v23.4s, v3.4h, v16.4h\n"
+ "ssubl2 v3.4s, v3.8h, v16.8h\n"
+ "ssubl v22.4s, v2.4h, v16.4h\n"
+ "ssubl2 v2.4s, v2.8h, v16.8h\n"
+ "ssubl v21.4s, v11.4h, v16.4h\n"
+ "ssubl2 v11.4s, v11.8h, v16.8h\n"
+ "dup v20.8h, w20\n"
+ "ssubl v19.4s, v13.4h, v20.4h\n"
+ "ssubl2 v13.4s, v13.8h, v20.8h\n"
+ "ssubl v18.4s, v12.4h, v20.4h\n"
+ "ssubl2 v12.4s, v12.8h, v20.8h\n"
+ "ssubl v17.4s, v10.4h, v20.4h\n"
+ "ssubl2 v10.4s, v10.8h, v20.8h\n"
+ "ssubl v16.4s, v9.4h, v20.4h\n"
+ "ssubl2 v9.4s, v9.8h, v20.8h\n"
+ "scvtf v8.4s, v1.4s\n"
+ "scvtf v7.4s, v4.4s\n"
+ "scvtf v6.4s, v23.4s\n"
+ "scvtf v5.4s, v3.4s\n"
+ "scvtf v4.4s, v22.4s\n"
+ "scvtf v3.4s, v2.4s\n"
+ "scvtf v2.4s, v21.4s\n"
+ "scvtf v1.4s, v11.4s\n"
+ "scvtf v19.4s, v19.4s\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "fmla v8.4s, v19.4s, v0.s[1]\n"
+ "scvtf v13.4s, v13.4s\n"
+ "fmul v7.4s, v7.4s, v0.s[0]\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "scvtf v18.4s, v18.4s\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "fmla v6.4s, v18.4s, v0.s[1]\n"
+ "scvtf v12.4s, v12.4s\n"
+ "fmul v5.4s, v5.4s, v0.s[0]\n"
+ "fmla v5.4s, v12.4s, v0.s[1]\n"
+ "scvtf v17.4s, v17.4s\n"
+ "fmul v4.4s, v4.4s, v0.s[0]\n"
+ "fmla v4.4s, v17.4s, v0.s[1]\n"
+ "scvtf v10.4s, v10.4s\n"
+ "fmul v3.4s, v3.4s, v0.s[0]\n"
+ "fmla v3.4s, v10.4s, v0.s[1]\n"
+ "scvtf v16.4s, v16.4s\n"
+ "fmul v2.4s, v2.4s, v0.s[0]\n"
+ "fmla v2.4s, v16.4s, v0.s[1]\n"
+ "scvtf v9.4s, v9.4s\n"
+ "fmul v1.4s, v1.4s, v0.s[0]\n"
+ "fmla v1.4s, v9.4s, v0.s[1]\n"
+ "cbz %x[out_direct], 23f\n"
+ "fmul v23.4s, v8.4s, v0.s[3]\n"
+ "fmul v22.4s, v7.4s, v0.s[3]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
+ "fmul v21.4s, v6.4s, v0.s[3]\n"
+ "fmul v20.4s, v5.4s, v0.s[3]\n"
+ "fmul v19.4s, v4.4s, v0.s[3]\n"
+ "fmul v18.4s, v3.4s, v0.s[3]\n"
+ "fmul v16.4s, v2.4s, v0.s[3]\n"
+ "fmul v17.4s, v1.4s, v0.s[3]\n"
+ "fcvtas v23.4s, v23.4s\n"
+ "fcvtas v22.4s, v22.4s\n"
+ "fcvtas v21.4s, v21.4s\n"
+ "fcvtas v20.4s, v20.4s\n"
+ "fcvtas v19.4s, v19.4s\n"
+ "fcvtas v18.4s, v18.4s\n"
+ "fcvtas v16.4s, v16.4s\n"
+ "fcvtas v17.4s, v17.4s\n"
+ "uzp1 v22.8h, v23.8h, v22.8h\n"
+ "uzp1 v20.8h, v21.8h, v20.8h\n"
+ "uzp1 v18.8h, v19.8h, v18.8h\n"
+ "uzp1 v17.8h, v16.8h, v17.8h\n"
+ "dup v16.8h, w20\n"
+ "add v22.8h, v22.8h, v16.8h\n"
+ "add v20.8h, v20.8h, v16.8h\n"
+ "add v18.8h, v18.8h, v16.8h\n"
+ "add v17.8h, v17.8h, v16.8h\n"
+ "movi v16.8h, #0xff\n"
+ "smin v22.8h, v22.8h, v16.8h\n"
+ "smin v20.8h, v20.8h, v16.8h\n"
+ "smin v18.8h, v18.8h, v16.8h\n"
+ "smin v17.8h, v17.8h, v16.8h\n"
+ "movi v16.8h, #0x0\n"
+ "smax v22.8h, v22.8h, v16.8h\n"
+ "smax v20.8h, v20.8h, v16.8h\n"
+ "smax v18.8h, v18.8h, v16.8h\n"
+ "smax v17.8h, v17.8h, v16.8h\n"
+ "xtn v22.8b, v22.8h\n"
+ "xtn v20.8b, v20.8h\n"
+ "xtn v18.8b, v18.8h\n"
+ "xtn v17.8b, v17.8h\n"
+ "tbz %x[width], #3, 18f\n"
+ "str d22, [x25, #0x0]\n"
+ "add x25, x25, #0x8\n"
+ "str d18, [x22, #0x0]\n"
+ "add x22, x22, #0x8\n"
+ "tbz %x[width], #2, 16f\n"
+ "str s20, [x25], #0x4\n"
+ "str s17, [x22], #0x4\n"
+ "tbz %x[width], #1, 15f\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v17.h }[2], [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v20.b }[6], [x25], #0x1\n"
+ "st1 { v17.b }[6], [x22], #0x1\n"
+ "b 22f\n"
+ "15:" // tail loop: Main loop: unique 2: partial_0_12
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v20.b }[4], [x25], #0x1\n"
+ "st1 { v17.b }[4], [x22], #0x1\n"
+ "b 22f\n"
+ "16:" // tail loop: Main loop: unique 2: partial_1_8
+ "tbz %x[width], #1, 17f\n"
+ "str h20, [x25], #0x2\n"
+ "str h17, [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v20.b }[2], [x25], #0x1\n"
+ "st1 { v17.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "17:" // tail loop: Main loop: unique 2: partial_0_8
+ "tbz %x[width], #0, 22f\n"
+ "str b20, [x25], #0x1\n"
+ "str b17, [x22], #0x1\n"
+ "b 22f\n"
+ "18:" // tail loop: Main loop: unique 2: partial_2_0
+ "tbz %x[width], #2, 20f\n"
+ "str s22, [x25], #0x4\n"
+ "str s18, [x22], #0x4\n"
+ "tbz %x[width], #1, 19f\n"
+ "st1 { v22.h }[2], [x25], #0x2\n"
+ "st1 { v18.h }[2], [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v22.b }[6], [x25], #0x1\n"
+ "st1 { v18.b }[6], [x22], #0x1\n"
+ "b 22f\n"
+ "19:" // tail loop: Main loop: unique 2: partial_0_4
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v22.b }[4], [x25], #0x1\n"
+ "st1 { v18.b }[4], [x22], #0x1\n"
+ "b 22f\n"
+ "20:" // tail loop: Main loop: unique 2: partial_1_0
+ "tbz %x[width], #1, 21f\n"
+ "str h22, [x25], #0x2\n"
+ "str h18, [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v22.b }[2], [x25], #0x1\n"
+ "st1 { v18.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // tail loop: Main loop: unique 2: partial_0_0
+ "str b22, [x25], #0x1\n"
+ "str b18, [x22], #0x1\n"
+ "22:" // tail loop: Main loop: unique 2: Done
+ "23:" // tail loop: Main loop: No direct output
+ "mov v19.16b, v28.16b\n"
+ "mov v13.16b, v29.16b\n"
+ "fmla v19.4s, v8.4s, v24.4s\n"
+ "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
+ "mov v18.16b, v30.16b\n"
+ "mov v12.16b, v31.16b\n"
+ "fmla v13.4s, v7.4s, v25.4s\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
+ "mov v17.16b, v28.16b\n"
+ "mov v10.16b, v29.16b\n"
+ "fmla v18.4s, v6.4s, v26.4s\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
+ "mov v16.16b, v30.16b\n"
+ "mov v9.16b, v31.16b\n"
+ "fmla v12.4s, v5.4s, v27.4s\n"
+ "fmla v17.4s, v4.4s, v24.4s\n"
+ "fmla v10.4s, v3.4s, v25.4s\n"
+ "fmul v8.4s, v19.4s, v0.s[2]\n"
+ "fmla v16.4s, v2.4s, v26.4s\n"
+ "fmla v9.4s, v1.4s, v27.4s\n"
+ "fmul v7.4s, v13.4s, v0.s[2]\n"
+ "fmul v6.4s, v18.4s, v0.s[2]\n"
+ "fmul v5.4s, v12.4s, v0.s[2]\n"
+ "fmul v4.4s, v17.4s, v0.s[2]\n"
+ "fmul v3.4s, v10.4s, v0.s[2]\n"
+ "fmul v2.4s, v16.4s, v0.s[2]\n"
+ "fmul v1.4s, v9.4s, v0.s[2]\n"
+ "fcvtas v8.4s, v8.4s\n"
+ "fcvtas v7.4s, v7.4s\n"
+ "fcvtas v6.4s, v6.4s\n"
+ "fcvtas v5.4s, v5.4s\n"
+ "fcvtas v4.4s, v4.4s\n"
+ "fcvtas v3.4s, v3.4s\n"
+ "fcvtas v2.4s, v2.4s\n"
+ "fcvtas v1.4s, v1.4s\n"
+ "uzp1 v7.8h, v8.8h, v7.8h\n"
+ "uzp1 v5.8h, v6.8h, v5.8h\n"
+ "uzp1 v3.8h, v4.8h, v3.8h\n"
+ "uzp1 v1.8h, v2.8h, v1.8h\n"
+ "dup v16.8h, w22\n"
+ "add v7.8h, v7.8h, v16.8h\n"
+ "add v5.8h, v5.8h, v16.8h\n"
+ "add v3.8h, v3.8h, v16.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w21\n"
+ "smin v7.8h, v7.8h, v16.8h\n"
+ "smin v5.8h, v5.8h, v16.8h\n"
+ "smin v3.8h, v3.8h, v16.8h\n"
+ "smin v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w20\n"
+ "smax v7.8h, v7.8h, v16.8h\n"
+ "smax v5.8h, v5.8h, v16.8h\n"
+ "smax v3.8h, v3.8h, v16.8h\n"
+ "smax v1.8h, v1.8h, v16.8h\n"
+ "xtn v7.8b, v7.8h\n"
+ "xtn v5.8b, v5.8h\n"
+ "xtn v3.8b, v3.8h\n"
+ "xtn v1.8b, v1.8h\n"
+ "tbz %x[width], #3, 27f\n"
+ "str d7, [x26, #0x0]\n"
+ "add x26, x26, #0x8\n"
+ "str d3, [x24, #0x0]\n"
+ "add x24, x24, #0x8\n"
+ "tbz %x[width], #2, 25f\n"
+ "str s5, [x26], #0x4\n"
+ "str s1, [x24], #0x4\n"
+ "tbz %x[width], #1, 24f\n"
+ "st1 { v5.h }[2], [x26], #0x2\n"
+ "st1 { v1.h }[2], [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v5.b }[6], [x26], #0x1\n"
+ "st1 { v1.b }[6], [x24], #0x1\n"
+ "b 31f\n"
+ "24:" // tail loop: unique 3: partial_0_12
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v5.b }[4], [x26], #0x1\n"
+ "st1 { v1.b }[4], [x24], #0x1\n"
+ "b 31f\n"
+ "25:" // tail loop: unique 3: partial_1_8
+ "tbz %x[width], #1, 26f\n"
+ "str h5, [x26], #0x2\n"
+ "str h1, [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v5.b }[2], [x26], #0x1\n"
+ "st1 { v1.b }[2], [x24], #0x1\n"
+ "b 31f\n"
+ "26:" // tail loop: unique 3: partial_0_8
+ "tbz %x[width], #0, 31f\n"
+ "str b5, [x26], #0x1\n"
+ "str b1, [x24], #0x1\n"
+ "b 31f\n"
+ "27:" // tail loop: unique 3: partial_2_0
+ "tbz %x[width], #2, 29f\n"
+ "str s7, [x26], #0x4\n"
+ "str s3, [x24], #0x4\n"
+ "tbz %x[width], #1, 28f\n"
+ "st1 { v7.h }[2], [x26], #0x2\n"
+ "st1 { v3.h }[2], [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v7.b }[6], [x26], #0x1\n"
+ "st1 { v3.b }[6], [x24], #0x1\n"
+ "b 31f\n"
+ "28:" // tail loop: unique 3: partial_0_4
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v7.b }[4], [x26], #0x1\n"
+ "st1 { v3.b }[4], [x24], #0x1\n"
+ "b 31f\n"
+ "29:" // tail loop: unique 3: partial_1_0
+ "tbz %x[width], #1, 30f\n"
+ "str h7, [x26], #0x2\n"
+ "str h3, [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v7.b }[2], [x26], #0x1\n"
+ "st1 { v3.b }[2], [x24], #0x1\n"
+ "b 31f\n"
+ "30:" // tail loop: unique 3: partial_0_0
+ "str b7, [x26], #0x1\n"
+ "str b3, [x24], #0x1\n"
+ "31:" // tail loop: unique 3: Done
+ "subs x23, x23, #0x2\n"
+ "bgt 6b\n"
+ "32:" // odd columns skip
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+ [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+ [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+ [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+ [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+}
+
+} // namespace
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_mul_add_u8_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ const ITensorInfo *final_output_info = final_output->info();
+ const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr;
+ const ITensorInfo *input1_info = input1->info();
+ const ITensorInfo *input2_info = input2->info();
+
+ const size_t out_stride = final_output_info->strides_in_bytes()[1];
+ const size_t out_direct_stride = (add_output != nullptr) ? add_output_info->strides_in_bytes()[1] : 0;
+ const size_t in0_stride = input1_info->strides_in_bytes()[1];
+ const size_t in1_stride = input2_info->strides_in_bytes()[1];
+
+ uint8_t minval = std::numeric_limits<uint8_t>::lowest();
+ uint8_t maxval = std::numeric_limits<uint8_t>::max();
+
+ const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
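+    // Map the requested activation to clamp bounds in the final output's quantized domain, so the
+    // assembly kernel can apply the activation as a plain min/max clamp.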
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ minval = quantize_qasymm8(0.f, final_output_qinfo);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ minval = quantize_qasymm8(0.f, final_output_qinfo);
+ maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ minval = quantize_qasymm8(act_info.b(), final_output_qinfo);
+ maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
+ }
+
+ const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+ const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+ const UniformQuantizationInfo add_output_qinfo =
+ (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+
+ const int32_t in1_offset = in1_qinfo.offset;
+ const int32_t in2_offset = in2_qinfo.offset;
+ const int32_t out_offset = final_output_qinfo.offset;
+ const int32_t out_direct_offset = add_output_qinfo.offset;
+
+ const float in1_scale = in1_qinfo.scale;
+ const float in2_scale = in2_qinfo.scale;
+ const float out_scale = final_output_qinfo.scale;
+ const float out_direct_scale = add_output_qinfo.scale;
+
+ const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer());
+ const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer());
+
+    // Clear X & Y dimensions on the execution window as they are handled manually by the kernel below
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in1_it(input1, window);
+ Iterator in2_it(input2, window);
+ Iterator out_it(final_output, window);
+
+ const size_t width = window.num_iterations(0);
+ const size_t height = window.num_iterations(1);
+
+ if (add_output != nullptr)
+ {
+ Iterator add_out_it(add_output, window);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_u8_fp32_2x16(
+ reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
+ reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
+ reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_u8_fp32_2x16(
+ reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, out_it);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // __aarch64__
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..e1a45b467b
--- /dev/null
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#ifdef __aarch64__
+namespace
+{
+void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out,
+ size_t out_stride,
+ int8_t *out_direct,
+ size_t out_direct_stride,
+ const int8_t *in0,
+ size_t in0_stride,
+ const int8_t *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const int8_t minval,
+ const int8_t maxval,
+ int32_t out_zeropt,
+ float out_scale,
+ int32_t out_direct_zeropt,
+ float out_direct_scale,
+ int32_t in0_zeropt,
+ float in0_scale,
+ int32_t in1_zeropt,
+ float in1_scale,
+ size_t width,
+ size_t height)
+{
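+    // As in the unsigned variant, the scales are packed so the assembly below can load them into v0
+    // and address them as v0.s[0..3].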
+ float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
+ struct KernelArgs
+ {
+ const float *scales;
+ int32_t in0_zeropt;
+ int32_t in1_zeropt;
+ int32_t out_zeropt;
+ int32_t out_direct_zeropt;
+ int32_t minval;
+ int32_t maxval;
+ } ka;
+ ka.scales = scales;
+ ka.in0_zeropt = in0_zeropt;
+ ka.in1_zeropt = in1_zeropt;
+ ka.out_zeropt = out_zeropt;
+ ka.out_direct_zeropt = out_direct_zeropt;
+ ka.minval = minval;
+ ka.maxval = maxval;
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[args_ptr], %[offsetof_scales]]\n"
+ "ld1 { v0.4s }, [x20]\n"
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n"
+ "1:" // Column loop
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x23, %x[height]\n"
+ "mov x12, %x[in0]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x11, %x[in1]\n"
+ "mov x10, %x[out]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x9, %x[out_direct]\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "2:" // Row loop
+ "mov x28, x12\n"
+ "ldr d4, [x28, #0x0]\n"
+ "ldr d3, [x28, #0x8]\n"
+ "add x21, x28, %x[in0_stride]\n"
+ "mov x27, x11\n"
+ "ldr d13, [x27, #0x0]\n"
+ "ldr d12, [x27, #0x8]\n"
+ "cmp x23, #0x2\n"
+ "add x12, x21, %x[in0_stride]\n"
+ "csel x21, x21, x28, GE\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ldr d11, [x21, #0x8]\n"
+ "add x20, x27, %x[in1_stride]\n"
+ "add x11, x20, %x[in1_stride]\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
+ "sshll v4.8h, v4.8b, #0x0\n"
+ "csel x20, x20, x27, GE\n"
+ "ldr d10, [x20, #0x0]\n"
+ "ldr d9, [x20, #0x8]\n"
+ "sshll v3.8h, v3.8b, #0x0\n"
+ "sshll v2.8h, v2.8b, #0x0\n"
+ "sshll v11.8h, v11.8b, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
+ "mov x26, x10\n"
+ "dup v16.8h, w21\n"
+ "sshll v13.8h, v13.8b, #0x0\n"
+ "mov x25, x9\n"
+ "add x24, x26, %x[out_stride]\n"
+ "sshll v12.8h, v12.8b, #0x0\n"
+ "sshll v10.8h, v10.8b, #0x0\n"
+ "add x22, x25, %x[out_direct_stride]\n"
+ "add x10, x24, %x[out_stride]\n"
+ "sshll v9.8h, v9.8b, #0x0\n"
+ "ssubl v1.4s, v4.4h, v16.4h\n"
+ "add x9, x22, %x[out_direct_stride]\n"
+ "csel x24, x24, x26, GE\n"
+ "ssubl2 v4.4s, v4.8h, v16.8h\n"
+ "ssubl v23.4s, v3.4h, v16.4h\n"
+ "csel x22, x22, x25, GE\n"
+ "ssubl2 v3.4s, v3.8h, v16.8h\n"
+ "ssubl v22.4s, v2.4h, v16.4h\n"
+ "ssubl2 v2.4s, v2.8h, v16.8h\n"
+ "ssubl v21.4s, v11.4h, v16.4h\n"
+ "ssubl2 v11.4s, v11.8h, v16.8h\n"
+ "dup v20.8h, w20\n"
+ "ssubl v19.4s, v13.4h, v20.4h\n"
+ "ssubl2 v13.4s, v13.8h, v20.8h\n"
+ "ssubl v18.4s, v12.4h, v20.4h\n"
+ "ssubl2 v12.4s, v12.8h, v20.8h\n"
+ "ssubl v17.4s, v10.4h, v20.4h\n"
+ "ssubl2 v10.4s, v10.8h, v20.8h\n"
+ "ssubl v16.4s, v9.4h, v20.4h\n"
+ "ssubl2 v9.4s, v9.8h, v20.8h\n"
+ "scvtf v8.4s, v1.4s\n"
+ "scvtf v7.4s, v4.4s\n"
+ "scvtf v6.4s, v23.4s\n"
+ "scvtf v5.4s, v3.4s\n"
+ "scvtf v4.4s, v22.4s\n"
+ "scvtf v3.4s, v2.4s\n"
+ "scvtf v2.4s, v21.4s\n"
+ "scvtf v1.4s, v11.4s\n"
+ "scvtf v19.4s, v19.4s\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "fmla v8.4s, v19.4s, v0.s[1]\n"
+ "scvtf v13.4s, v13.4s\n"
+ "fmul v7.4s, v7.4s, v0.s[0]\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "scvtf v18.4s, v18.4s\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "fmla v6.4s, v18.4s, v0.s[1]\n"
+ "scvtf v12.4s, v12.4s\n"
+ "fmul v5.4s, v5.4s, v0.s[0]\n"
+ "fmla v5.4s, v12.4s, v0.s[1]\n"
+ "scvtf v17.4s, v17.4s\n"
+ "fmul v4.4s, v4.4s, v0.s[0]\n"
+ "fmla v4.4s, v17.4s, v0.s[1]\n"
+ "scvtf v10.4s, v10.4s\n"
+ "fmul v3.4s, v3.4s, v0.s[0]\n"
+ "fmla v3.4s, v10.4s, v0.s[1]\n"
+ "scvtf v16.4s, v16.4s\n"
+ "fmul v2.4s, v2.4s, v0.s[0]\n"
+ "fmla v2.4s, v16.4s, v0.s[1]\n"
+ "scvtf v9.4s, v9.4s\n"
+ "fmul v1.4s, v1.4s, v0.s[0]\n"
+ "fmla v1.4s, v9.4s, v0.s[1]\n"
+ "cbz %x[out_direct], 3f\n"
+ "fmul v23.4s, v8.4s, v0.s[3]\n"
+ "fmul v22.4s, v7.4s, v0.s[3]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
+ "fmul v21.4s, v6.4s, v0.s[3]\n"
+ "fmul v20.4s, v5.4s, v0.s[3]\n"
+ "fmul v17.4s, v4.4s, v0.s[3]\n"
+ "fmul v19.4s, v3.4s, v0.s[3]\n"
+ "fmul v16.4s, v2.4s, v0.s[3]\n"
+ "fmul v18.4s, v1.4s, v0.s[3]\n"
+ "fcvtas v23.4s, v23.4s\n"
+ "fcvtas v22.4s, v22.4s\n"
+ "fcvtas v21.4s, v21.4s\n"
+ "fcvtas v20.4s, v20.4s\n"
+ "fcvtas v17.4s, v17.4s\n"
+ "fcvtas v19.4s, v19.4s\n"
+ "fcvtas v16.4s, v16.4s\n"
+ "fcvtas v18.4s, v18.4s\n"
+ "uzp1 v22.8h, v23.8h, v22.8h\n"
+ "uzp1 v20.8h, v21.8h, v20.8h\n"
+ "uzp1 v19.8h, v17.8h, v19.8h\n"
+ "uzp1 v18.8h, v16.8h, v18.8h\n"
+ "dup v16.8h, w20\n"
+ "add v22.8h, v22.8h, v16.8h\n"
+ "add v20.8h, v20.8h, v16.8h\n"
+ "add v19.8h, v19.8h, v16.8h\n"
+ "add v18.8h, v18.8h, v16.8h\n"
+ "movi v17.8h, #0x7f\n"
+ "mvni v16.8h, #0x7f\n"
+ "smin v22.8h, v22.8h, v17.8h\n"
+ "smin v20.8h, v20.8h, v17.8h\n"
+ "smin v19.8h, v19.8h, v17.8h\n"
+ "smin v18.8h, v18.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v16.8h\n"
+ "smax v20.8h, v20.8h, v16.8h\n"
+ "smax v19.8h, v19.8h, v16.8h\n"
+ "smax v18.8h, v18.8h, v16.8h\n"
+ "xtn v22.8b, v22.8h\n"
+ "str d22, [x25, #0x0]\n"
+ "xtn v20.8b, v20.8h\n"
+ "xtn v19.8b, v19.8h\n"
+ "str d20, [x25, #0x8]\n"
+ "xtn v18.8b, v18.8h\n"
+ "str d19, [x22, #0x0]\n"
+ "str d18, [x22, #0x8]\n"
+ "3:" // Main loop: No direct output
+ "mov v19.16b, v28.16b\n"
+ "mov v13.16b, v29.16b\n"
+ "fmla v19.4s, v8.4s, v24.4s\n"
+ "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
+ "mov v18.16b, v30.16b\n"
+ "mov v12.16b, v31.16b\n"
+ "fmla v13.4s, v7.4s, v25.4s\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
+ "mov v17.16b, v28.16b\n"
+ "mov v10.16b, v29.16b\n"
+ "fmla v18.4s, v6.4s, v26.4s\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
+ "mov v16.16b, v30.16b\n"
+ "mov v9.16b, v31.16b\n"
+ "fmla v12.4s, v5.4s, v27.4s\n"
+ "subs x23, x23, #0x2\n"
+ "fmla v17.4s, v4.4s, v24.4s\n"
+ "fmla v10.4s, v3.4s, v25.4s\n"
+ "fmul v8.4s, v19.4s, v0.s[2]\n"
+ "fmla v16.4s, v2.4s, v26.4s\n"
+ "fmla v9.4s, v1.4s, v27.4s\n"
+ "fmul v7.4s, v13.4s, v0.s[2]\n"
+ "fmul v6.4s, v18.4s, v0.s[2]\n"
+ "fmul v5.4s, v12.4s, v0.s[2]\n"
+ "fmul v4.4s, v17.4s, v0.s[2]\n"
+ "fmul v3.4s, v10.4s, v0.s[2]\n"
+ "fmul v2.4s, v16.4s, v0.s[2]\n"
+ "fmul v1.4s, v9.4s, v0.s[2]\n"
+ "fcvtas v8.4s, v8.4s\n"
+ "fcvtas v7.4s, v7.4s\n"
+ "fcvtas v6.4s, v6.4s\n"
+ "fcvtas v5.4s, v5.4s\n"
+ "fcvtas v4.4s, v4.4s\n"
+ "fcvtas v3.4s, v3.4s\n"
+ "fcvtas v2.4s, v2.4s\n"
+ "fcvtas v1.4s, v1.4s\n"
+ "uzp1 v7.8h, v8.8h, v7.8h\n"
+ "uzp1 v5.8h, v6.8h, v5.8h\n"
+ "uzp1 v3.8h, v4.8h, v3.8h\n"
+ "uzp1 v1.8h, v2.8h, v1.8h\n"
+ "dup v16.8h, w22\n"
+ "add v7.8h, v7.8h, v16.8h\n"
+ "add v5.8h, v5.8h, v16.8h\n"
+ "add v3.8h, v3.8h, v16.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w21\n"
+ "smin v7.8h, v7.8h, v16.8h\n"
+ "smin v5.8h, v5.8h, v16.8h\n"
+ "smin v3.8h, v3.8h, v16.8h\n"
+ "smin v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w20\n"
+ "smax v7.8h, v7.8h, v16.8h\n"
+ "smax v5.8h, v5.8h, v16.8h\n"
+ "smax v3.8h, v3.8h, v16.8h\n"
+ "smax v1.8h, v1.8h, v16.8h\n"
+ "xtn v7.8b, v7.8h\n"
+ "str d7, [x26, #0x0]\n"
+ "xtn v5.8b, v5.8h\n"
+ "xtn v3.8b, v3.8h\n"
+ "str d5, [x26, #0x8]\n"
+ "xtn v1.8b, v1.8h\n"
+ "str d3, [x24, #0x0]\n"
+ "str d1, [x24, #0x8]\n"
+ "bgt 2b\n"
+ "add %x[in0], %x[in0], #0x10\n"
+ "add %x[in1], %x[in1], #0x10\n"
+ "add %x[out], %x[out], #0x10\n"
+ "cbz %x[out_direct], 4f\n"
+ "add %x[out_direct], %x[out_direct], #0x10\n"
+ "4:" // No direct pointer update
+ "sub %x[width], %x[width], #0x10\n"
+ "cmp %x[width], #0x10\n"
+ "bge 1b\n"
+ "cbz %x[width], 32f\n"
+ "5:" // main loop skip
+ "ldr q24, [%x[bn_mul], #0x0]\n"
+ "ldr q25, [%x[bn_mul], #0x10]\n"
+ "mov x23, %x[height]\n"
+ "mov x12, %x[in0]\n"
+ "ldr q26, [%x[bn_mul], #0x20]\n"
+ "ldr q27, [%x[bn_mul], #0x30]\n"
+ "mov x11, %x[in1]\n"
+ "mov x10, %x[out]\n"
+ "ldr q28, [%x[bn_add], #0x0]\n"
+ "ldr q29, [%x[bn_add], #0x10]\n"
+ "mov x9, %x[out_direct]\n"
+ "add %x[bn_mul], %x[bn_mul], #0x40\n"
+ "ldr q30, [%x[bn_add], #0x20]\n"
+ "ldr q31, [%x[bn_add], #0x30]\n"
+ "add %x[bn_add], %x[bn_add], #0x40\n"
+ "6:" // tail loop: Row loop
+ "mov x28, x12\n"
+ "mov x27, x11\n"
+ "mov x26, x10\n"
+ "mov x25, x9\n"
+ "add x21, x28, %x[in0_stride]\n"
+ "add x20, x27, %x[in1_stride]\n"
+ "add x24, x26, %x[out_stride]\n"
+ "add x22, x25, %x[out_direct_stride]\n"
+ "cmp x23, #0x2\n"
+ "add x12, x21, %x[in0_stride]\n"
+ "add x11, x20, %x[in1_stride]\n"
+ "add x10, x24, %x[out_stride]\n"
+ "add x9, x22, %x[out_direct_stride]\n"
+ "csel x21, x21, x28, GE\n"
+ "csel x20, x20, x27, GE\n"
+ "csel x24, x24, x26, GE\n"
+ "csel x22, x22, x25, GE\n"
+ "tbz %x[width], #3, 10f\n"
+ "ldr d4, [x28, #0x0]\n"
+ "ldr d13, [x27, #0x0]\n"
+ "add x28, x28, #0x8\n"
+ "add x27, x27, #0x8\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ldr d10, [x20, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "add x20, x20, #0x8\n"
+ "tbz %x[width], #2, 8f\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s12, [x27], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[width], #1, 7f\n"
+ "ld1 { v3.h }[2], [x28], #0x2\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v3.b }[6], [x28], #0x1\n"
+ "ld1 { v12.b }[6], [x27], #0x1\n"
+ "ld1 { v11.b }[6], [x21], #0x1\n"
+ "ld1 { v9.b }[6], [x20], #0x1\n"
+ "b 14f\n"
+ "7:" // tail loop: unique 1: partial_0_12
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v3.b }[4], [x28], #0x1\n"
+ "ld1 { v12.b }[4], [x27], #0x1\n"
+ "ld1 { v11.b }[4], [x21], #0x1\n"
+ "ld1 { v9.b }[4], [x20], #0x1\n"
+ "b 14f\n"
+ "8:" // tail loop: unique 1: partial_1_8
+ "tbz %x[width], #1, 9f\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h12, [x27], #0x2\n"
+ "ldr h11, [x21], #0x2\n"
+ "ldr h9, [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v12.b }[2], [x27], #0x1\n"
+ "ld1 { v11.b }[2], [x21], #0x1\n"
+ "ld1 { v9.b }[2], [x20], #0x1\n"
+ "b 14f\n"
+ "9:" // tail loop: unique 1: partial_0_8
+ "tbz %x[width], #0, 14f\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b12, [x27], #0x1\n"
+ "ldr b11, [x21], #0x1\n"
+ "ldr b9, [x20], #0x1\n"
+ "b 14f\n"
+ "10:" // tail loop: unique 1: partial_2_0
+ "tbz %x[width], #2, 12f\n"
+ "ldr s4, [x28], #0x4\n"
+ "ldr s13, [x27], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[width], #1, 11f\n"
+ "ld1 { v4.h }[2], [x28], #0x2\n"
+ "ld1 { v13.h }[2], [x27], #0x2\n"
+ "ld1 { v2.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v4.b }[6], [x28], #0x1\n"
+ "ld1 { v13.b }[6], [x27], #0x1\n"
+ "ld1 { v2.b }[6], [x21], #0x1\n"
+ "ld1 { v10.b }[6], [x20], #0x1\n"
+ "b 14f\n"
+ "11:" // tail loop: unique 1: partial_0_4
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v4.b }[4], [x28], #0x1\n"
+ "ld1 { v13.b }[4], [x27], #0x1\n"
+ "ld1 { v2.b }[4], [x21], #0x1\n"
+ "ld1 { v10.b }[4], [x20], #0x1\n"
+ "b 14f\n"
+ "12:" // tail loop: unique 1: partial_1_0
+ "tbz %x[width], #1, 13f\n"
+ "ldr h4, [x28], #0x2\n"
+ "ldr h13, [x27], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr h10, [x20], #0x2\n"
+ "tbz %x[width], #0, 14f\n"
+ "ld1 { v4.b }[2], [x28], #0x1\n"
+ "ld1 { v13.b }[2], [x27], #0x1\n"
+ "ld1 { v2.b }[2], [x21], #0x1\n"
+ "ld1 { v10.b }[2], [x20], #0x1\n"
+ "b 14f\n"
+ "13:" // tail loop: unique 1: partial_0_0
+ "ldr b4, [x28], #0x1\n"
+ "ldr b13, [x27], #0x1\n"
+ "ldr b2, [x21], #0x1\n"
+ "ldr b10, [x20], #0x1\n"
+ "14:" // tail loop: unique 1: Done
+ "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
+ "sshll v4.8h, v4.8b, #0x0\n"
+ "sshll v3.8h, v3.8b, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
+ "sshll v2.8h, v2.8b, #0x0\n"
+ "sshll v11.8h, v11.8b, #0x0\n"
+ "dup v16.8h, w21\n"
+ "sshll v13.8h, v13.8b, #0x0\n"
+ "sshll v12.8h, v12.8b, #0x0\n"
+ "sshll v10.8h, v10.8b, #0x0\n"
+ "sshll v9.8h, v9.8b, #0x0\n"
+ "ssubl v1.4s, v4.4h, v16.4h\n"
+ "ssubl2 v4.4s, v4.8h, v16.8h\n"
+ "ssubl v23.4s, v3.4h, v16.4h\n"
+ "ssubl2 v3.4s, v3.8h, v16.8h\n"
+ "ssubl v22.4s, v2.4h, v16.4h\n"
+ "ssubl2 v2.4s, v2.8h, v16.8h\n"
+ "ssubl v21.4s, v11.4h, v16.4h\n"
+ "ssubl2 v11.4s, v11.8h, v16.8h\n"
+ "dup v20.8h, w20\n"
+ "ssubl v19.4s, v13.4h, v20.4h\n"
+ "ssubl2 v13.4s, v13.8h, v20.8h\n"
+ "ssubl v18.4s, v12.4h, v20.4h\n"
+ "ssubl2 v12.4s, v12.8h, v20.8h\n"
+ "ssubl v17.4s, v10.4h, v20.4h\n"
+ "ssubl2 v10.4s, v10.8h, v20.8h\n"
+ "ssubl v16.4s, v9.4h, v20.4h\n"
+ "ssubl2 v9.4s, v9.8h, v20.8h\n"
+ "scvtf v8.4s, v1.4s\n"
+ "scvtf v7.4s, v4.4s\n"
+ "scvtf v6.4s, v23.4s\n"
+ "scvtf v5.4s, v3.4s\n"
+ "scvtf v4.4s, v22.4s\n"
+ "scvtf v3.4s, v2.4s\n"
+ "scvtf v2.4s, v21.4s\n"
+ "scvtf v1.4s, v11.4s\n"
+ "scvtf v19.4s, v19.4s\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "fmla v8.4s, v19.4s, v0.s[1]\n"
+ "scvtf v13.4s, v13.4s\n"
+ "fmul v7.4s, v7.4s, v0.s[0]\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "scvtf v18.4s, v18.4s\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "fmla v6.4s, v18.4s, v0.s[1]\n"
+ "scvtf v12.4s, v12.4s\n"
+ "fmul v5.4s, v5.4s, v0.s[0]\n"
+ "fmla v5.4s, v12.4s, v0.s[1]\n"
+ "scvtf v17.4s, v17.4s\n"
+ "fmul v4.4s, v4.4s, v0.s[0]\n"
+ "fmla v4.4s, v17.4s, v0.s[1]\n"
+ "scvtf v10.4s, v10.4s\n"
+ "fmul v3.4s, v3.4s, v0.s[0]\n"
+ "fmla v3.4s, v10.4s, v0.s[1]\n"
+ "scvtf v16.4s, v16.4s\n"
+ "fmul v2.4s, v2.4s, v0.s[0]\n"
+ "fmla v2.4s, v16.4s, v0.s[1]\n"
+ "scvtf v9.4s, v9.4s\n"
+ "fmul v1.4s, v1.4s, v0.s[0]\n"
+ "fmla v1.4s, v9.4s, v0.s[1]\n"
+ "cbz %x[out_direct], 23f\n"
+ "fmul v23.4s, v8.4s, v0.s[3]\n"
+ "fmul v22.4s, v7.4s, v0.s[3]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
+ "fmul v21.4s, v6.4s, v0.s[3]\n"
+ "fmul v20.4s, v5.4s, v0.s[3]\n"
+ "fmul v17.4s, v4.4s, v0.s[3]\n"
+ "fmul v19.4s, v3.4s, v0.s[3]\n"
+ "fmul v16.4s, v2.4s, v0.s[3]\n"
+ "fmul v18.4s, v1.4s, v0.s[3]\n"
+ "fcvtas v23.4s, v23.4s\n"
+ "fcvtas v22.4s, v22.4s\n"
+ "fcvtas v21.4s, v21.4s\n"
+ "fcvtas v20.4s, v20.4s\n"
+ "fcvtas v17.4s, v17.4s\n"
+ "fcvtas v19.4s, v19.4s\n"
+ "fcvtas v16.4s, v16.4s\n"
+ "fcvtas v18.4s, v18.4s\n"
+ "uzp1 v22.8h, v23.8h, v22.8h\n"
+ "uzp1 v20.8h, v21.8h, v20.8h\n"
+ "uzp1 v19.8h, v17.8h, v19.8h\n"
+ "uzp1 v18.8h, v16.8h, v18.8h\n"
+ "dup v16.8h, w20\n"
+ "add v22.8h, v22.8h, v16.8h\n"
+ "add v20.8h, v20.8h, v16.8h\n"
+ "add v19.8h, v19.8h, v16.8h\n"
+ "add v18.8h, v18.8h, v16.8h\n"
+ "movi v17.8h, #0x7f\n"
+ "mvni v16.8h, #0x7f\n"
+ "smin v22.8h, v22.8h, v17.8h\n"
+ "smin v20.8h, v20.8h, v17.8h\n"
+ "smin v19.8h, v19.8h, v17.8h\n"
+ "smin v18.8h, v18.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v16.8h\n"
+ "smax v20.8h, v20.8h, v16.8h\n"
+ "smax v19.8h, v19.8h, v16.8h\n"
+ "smax v18.8h, v18.8h, v16.8h\n"
+ "xtn v22.8b, v22.8h\n"
+ "xtn v20.8b, v20.8h\n"
+ "xtn v19.8b, v19.8h\n"
+ "xtn v18.8b, v18.8h\n"
+ "tbz %x[width], #3, 18f\n"
+ "str d22, [x25, #0x0]\n"
+ "add x25, x25, #0x8\n"
+ "str d19, [x22, #0x0]\n"
+ "add x22, x22, #0x8\n"
+ "tbz %x[width], #2, 16f\n"
+ "str s20, [x25], #0x4\n"
+ "str s18, [x22], #0x4\n"
+ "tbz %x[width], #1, 15f\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v18.h }[2], [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v20.b }[6], [x25], #0x1\n"
+ "st1 { v18.b }[6], [x22], #0x1\n"
+ "b 22f\n"
+ "15:" // tail loop: Main loop: unique 2: partial_0_12
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v20.b }[4], [x25], #0x1\n"
+ "st1 { v18.b }[4], [x22], #0x1\n"
+ "b 22f\n"
+ "16:" // tail loop: Main loop: unique 2: partial_1_8
+ "tbz %x[width], #1, 17f\n"
+ "str h20, [x25], #0x2\n"
+ "str h18, [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v20.b }[2], [x25], #0x1\n"
+ "st1 { v18.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "17:" // tail loop: Main loop: unique 2: partial_0_8
+ "tbz %x[width], #0, 22f\n"
+ "str b20, [x25], #0x1\n"
+ "str b18, [x22], #0x1\n"
+ "b 22f\n"
+ "18:" // tail loop: Main loop: unique 2: partial_2_0
+ "tbz %x[width], #2, 20f\n"
+ "str s22, [x25], #0x4\n"
+ "str s19, [x22], #0x4\n"
+ "tbz %x[width], #1, 19f\n"
+ "st1 { v22.h }[2], [x25], #0x2\n"
+ "st1 { v19.h }[2], [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v22.b }[6], [x25], #0x1\n"
+ "st1 { v19.b }[6], [x22], #0x1\n"
+ "b 22f\n"
+ "19:" // tail loop: Main loop: unique 2: partial_0_4
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v22.b }[4], [x25], #0x1\n"
+ "st1 { v19.b }[4], [x22], #0x1\n"
+ "b 22f\n"
+ "20:" // tail loop: Main loop: unique 2: partial_1_0
+ "tbz %x[width], #1, 21f\n"
+ "str h22, [x25], #0x2\n"
+ "str h19, [x22], #0x2\n"
+ "tbz %x[width], #0, 22f\n"
+ "st1 { v22.b }[2], [x25], #0x1\n"
+ "st1 { v19.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // tail loop: Main loop: unique 2: partial_0_0
+ "str b22, [x25], #0x1\n"
+ "str b19, [x22], #0x1\n"
+ "22:" // tail loop: Main loop: unique 2: Done
+ "23:" // tail loop: Main loop: No direct output
+ "mov v19.16b, v28.16b\n"
+ "mov v13.16b, v29.16b\n"
+ "fmla v19.4s, v8.4s, v24.4s\n"
+ "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
+ "mov v18.16b, v30.16b\n"
+ "mov v12.16b, v31.16b\n"
+ "fmla v13.4s, v7.4s, v25.4s\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
+ "mov v17.16b, v28.16b\n"
+ "mov v10.16b, v29.16b\n"
+ "fmla v18.4s, v6.4s, v26.4s\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
+ "mov v16.16b, v30.16b\n"
+ "mov v9.16b, v31.16b\n"
+ "fmla v12.4s, v5.4s, v27.4s\n"
+ "fmla v17.4s, v4.4s, v24.4s\n"
+ "fmla v10.4s, v3.4s, v25.4s\n"
+ "fmul v8.4s, v19.4s, v0.s[2]\n"
+ "fmla v16.4s, v2.4s, v26.4s\n"
+ "fmla v9.4s, v1.4s, v27.4s\n"
+ "fmul v7.4s, v13.4s, v0.s[2]\n"
+ "fmul v6.4s, v18.4s, v0.s[2]\n"
+ "fmul v5.4s, v12.4s, v0.s[2]\n"
+ "fmul v4.4s, v17.4s, v0.s[2]\n"
+ "fmul v3.4s, v10.4s, v0.s[2]\n"
+ "fmul v2.4s, v16.4s, v0.s[2]\n"
+ "fmul v1.4s, v9.4s, v0.s[2]\n"
+ "fcvtas v8.4s, v8.4s\n"
+ "fcvtas v7.4s, v7.4s\n"
+ "fcvtas v6.4s, v6.4s\n"
+ "fcvtas v5.4s, v5.4s\n"
+ "fcvtas v4.4s, v4.4s\n"
+ "fcvtas v3.4s, v3.4s\n"
+ "fcvtas v2.4s, v2.4s\n"
+ "fcvtas v1.4s, v1.4s\n"
+ "uzp1 v7.8h, v8.8h, v7.8h\n"
+ "uzp1 v5.8h, v6.8h, v5.8h\n"
+ "uzp1 v3.8h, v4.8h, v3.8h\n"
+ "uzp1 v1.8h, v2.8h, v1.8h\n"
+ "dup v16.8h, w22\n"
+ "add v7.8h, v7.8h, v16.8h\n"
+ "add v5.8h, v5.8h, v16.8h\n"
+ "add v3.8h, v3.8h, v16.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w21\n"
+ "smin v7.8h, v7.8h, v16.8h\n"
+ "smin v5.8h, v5.8h, v16.8h\n"
+ "smin v3.8h, v3.8h, v16.8h\n"
+ "smin v1.8h, v1.8h, v16.8h\n"
+ "dup v16.8h, w20\n"
+ "smax v7.8h, v7.8h, v16.8h\n"
+ "smax v5.8h, v5.8h, v16.8h\n"
+ "smax v3.8h, v3.8h, v16.8h\n"
+ "smax v1.8h, v1.8h, v16.8h\n"
+ "xtn v7.8b, v7.8h\n"
+ "xtn v5.8b, v5.8h\n"
+ "xtn v3.8b, v3.8h\n"
+ "xtn v1.8b, v1.8h\n"
+ "tbz %x[width], #3, 27f\n"
+ "str d7, [x26, #0x0]\n"
+ "add x26, x26, #0x8\n"
+ "str d3, [x24, #0x0]\n"
+ "add x24, x24, #0x8\n"
+ "tbz %x[width], #2, 25f\n"
+ "str s5, [x26], #0x4\n"
+ "str s1, [x24], #0x4\n"
+ "tbz %x[width], #1, 24f\n"
+ "st1 { v5.h }[2], [x26], #0x2\n"
+ "st1 { v1.h }[2], [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v5.b }[6], [x26], #0x1\n"
+ "st1 { v1.b }[6], [x24], #0x1\n"
+ "b 31f\n"
+ "24:" // tail loop: unique 3: partial_0_12
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v5.b }[4], [x26], #0x1\n"
+ "st1 { v1.b }[4], [x24], #0x1\n"
+ "b 31f\n"
+ "25:" // tail loop: unique 3: partial_1_8
+ "tbz %x[width], #1, 26f\n"
+ "str h5, [x26], #0x2\n"
+ "str h1, [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v5.b }[2], [x26], #0x1\n"
+ "st1 { v1.b }[2], [x24], #0x1\n"
+ "b 31f\n"
+ "26:" // tail loop: unique 3: partial_0_8
+ "tbz %x[width], #0, 31f\n"
+ "str b5, [x26], #0x1\n"
+ "str b1, [x24], #0x1\n"
+ "b 31f\n"
+ "27:" // tail loop: unique 3: partial_2_0
+ "tbz %x[width], #2, 29f\n"
+ "str s7, [x26], #0x4\n"
+ "str s3, [x24], #0x4\n"
+ "tbz %x[width], #1, 28f\n"
+ "st1 { v7.h }[2], [x26], #0x2\n"
+ "st1 { v3.h }[2], [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v7.b }[6], [x26], #0x1\n"
+ "st1 { v3.b }[6], [x24], #0x1\n"
+ "b 31f\n"
+ "28:" // tail loop: unique 3: partial_0_4
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v7.b }[4], [x26], #0x1\n"
+ "st1 { v3.b }[4], [x24], #0x1\n"
+ "b 31f\n"
+ "29:" // tail loop: unique 3: partial_1_0
+ "tbz %x[width], #1, 30f\n"
+ "str h7, [x26], #0x2\n"
+ "str h3, [x24], #0x2\n"
+ "tbz %x[width], #0, 31f\n"
+ "st1 { v7.b }[2], [x26], #0x1\n"
+ "st1 { v3.b }[2], [x24], #0x1\n"
+ "b 31f\n"
+ "30:" // tail loop: unique 3: partial_0_0
+ "str b7, [x26], #0x1\n"
+ "str b3, [x24], #0x1\n"
+ "31:" // tail loop: unique 3: Done
+ "subs x23, x23, #0x2\n"
+ "bgt 6b\n"
+ "32:" // odd columns skip
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+ [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+ [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+ [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+ [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+}
+
+} // namespace
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_mul_add_s8_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ const ITensorInfo *final_output_info = final_output->info();
+ const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr;
+ const ITensorInfo *input1_info = input1->info();
+ const ITensorInfo *input2_info = input2->info();
+
+ const size_t out_stride = final_output_info->strides_in_bytes()[1];
+ const size_t out_direct_stride = (add_output != nullptr) ? add_output_info->strides_in_bytes()[1] : 0;
+ const size_t in0_stride = input1_info->strides_in_bytes()[1];
+ const size_t in1_stride = input2_info->strides_in_bytes()[1];
+
+ int8_t minval = std::numeric_limits<int8_t>::lowest();
+ int8_t maxval = std::numeric_limits<int8_t>::max();
+
+ const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
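+    // As in the unsigned variant, translate the activation into quantized clamp bounds for the kernel.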
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
+ maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo);
+ maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
+ }
+
+ const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+ const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+ const UniformQuantizationInfo add_output_qinfo =
+ (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+
+ const int32_t in1_offset = in1_qinfo.offset;
+ const int32_t in2_offset = in2_qinfo.offset;
+ const int32_t out_offset = final_output_qinfo.offset;
+ const int32_t out_direct_offset = add_output_qinfo.offset;
+
+ const float in1_scale = in1_qinfo.scale;
+ const float in2_scale = in2_qinfo.scale;
+ const float out_scale = final_output_qinfo.scale;
+ const float out_direct_scale = add_output_qinfo.scale;
+
+ const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer());
+ const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer());
+
+    // Clear X & Y dimensions on the execution window as they are handled manually by the kernel below
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in1_it(input1, window);
+ Iterator in2_it(input2, window);
+ Iterator out_it(final_output, window);
+
+ const size_t width = window.num_iterations(0);
+ const size_t height = window.num_iterations(1);
+
+ if (add_output != nullptr)
+ {
+ Iterator add_out_it(add_output, window);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_s8_fp32_2x16(
+ reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()),
+ out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval,
+ out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset,
+ in2_scale, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_s8_fp32_2x16(
+ reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, out_it);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // __aarch64__
diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h
new file mode 100644
index 0000000000..568003a916
--- /dev/null
+++ b/src/cpu/kernels/addmuladd/list.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CPU_KERNELS_ADDMULADD_LIST
+#define SRC_CPU_KERNELS_ADDMULADD_LIST
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \
+ void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \
+ ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \
+ const ActivationLayerInfo &act_info, const Window &window)
+
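+// Each DECLARE_ADD_MUL_ADD_KERNEL(name) line below expands to an ordinary function declaration,
+// e.g. (illustrative only, not an extra declaration in this header):
+//   void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul,
+//                            const ITensor *bn_add, ITensor *add_output, ITensor *final_output,
+//                            ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window);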
+DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon);
+DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon);
+DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_u8_neon);
+DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_s8_neon);
+
+#undef DECLARE_ADD_MUL_ADD_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CPU_KERNELS_ADDMULADD_LIST */
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
new file mode 100644
index 0000000000..6e8f32ef47
--- /dev/null
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/NEON/INEKernel.h"
+#include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
+
+#include "gemm_common.hpp"
+
+namespace arm_compute
+{
+class ITensor;
+
+namespace cpu
+{
+namespace kernel
+{
+/** This class is a wrapper for the assembly kernels.
+ *
+ * Some kernels are written in assembly and highly optimised for specific CPUs such as the Cortex-A53 or Cortex-A55.
+ * This class works as a wrapper for these assembly kernels. The Arm Compute Library creates an instance
+ * of CpuGemmAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel
+ * in the context of a NEON function.
+ *
+ * The type T is the type of the actual kernel implemented in assembly, which is of type
+ * template<typename To, typename Tr> class GemmCommon.
+ *
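+ * A simplified usage sketch (the creation of the underlying arm_gemm kernel and the name tag are only
+ * illustrative assumptions, not prescribed by this class):
+ *
+ * @code
+ * arm_gemm::GemmCommon<float, float> *asm_kernel = get_asm_kernel(); // obtained from arm_gemm, details omitted
+ *
+ * CpuGemmAssemblyWrapperKernel<float, float> wrapper{};
+ * wrapper.configure(asm_kernel, "a64_hybrid_fp32_mla_6x16");
+ *
+ * // The scheduler can then call wrapper.run(window, info) like any other NEON kernel.
+ * @endcode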
+ */
+template <typename TypeInput, typename TypeOutput>
+class CpuGemmAssemblyWrapperKernel final : public INEKernel
+{
+public:
+ /** Constructor
+ */
+ CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
+ {
+ }
+
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
+ CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
+
+ const char *name() const override
+ {
+ return _name.c_str();
+ }
+
+ void run(const Window &window, const ThreadInfo &info) override
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+ auto win = arm_gemm::to_ndcoord(window);
+
+ arm_gemm::ndcoord_t thread_locator{};
+
+ _kernel->execute(win, thread_locator, info.thread_id);
+ }
+
+ // Inherited methods overridden:
+ void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+ //convert between arm_compute and arm_gemm types
+ auto ndc_win = arm_gemm::to_ndcoord(window);
+ auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator);
+
+ _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
+ }
+
+    /** Initialise the kernel with the assembly kernel implementation to wrap.
+     *
+     * @param[in] kernel          Pointer to an assembly kernel implementation.
+     * @param[in] kernel_name_tag Tag to be attached to the kernel's name.
+     */
+ void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
+ _kernel = kernel;
+
+ Window win = to_window(kernel->get_window_size());
+
+ INEKernel::configure(win);
+
+ if (!kernel_name_tag.empty())
+ {
+ _name += "/" + kernel_name_tag;
+ }
+ }
+    /** Return the minimum workload size of the relevant kernel.
+     *
+     * @param[in] platform     The CPU platform used to create the context.
+     * @param[in] thread_count Number of threads in the execution.
+     *
+     * @return Minimum workload size for the requested configuration.
+     */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override
+ {
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
+ }
+
+private:
+ arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
+ std::string _name;
+};
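+
+// A minimal usage sketch (illustrative only): `gemm_kernel` is assumed to be an
+// arm_gemm::GemmCommon<float, float> instance obtained from the arm_gemm factory
+// functions, and the name tag is just an example string.
+//
+//   auto wrapper = std::make_unique<CpuGemmAssemblyWrapperKernel<float, float>>();
+//   wrapper->configure(gemm_kernel.get(), "a64_hybrid_fp32_mla_6x16");
+//   // The configured wrapper can then be scheduled like any other INEKernel.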
+} // namespace kernel
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
new file mode 100644
index 0000000000..941fed0ba8
--- /dev/null
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2018-2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP
+#define ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP
+
+#pragma once
+
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
+#include <cstring>
+#include <memory>
+#include <vector>
+
+namespace arm_gemm
+{
+enum class GemmMethod
+{
+ DEFAULT,
+ GEMV_BATCHED,
+ GEMV_PRETRANSPOSED,
+ GEMV_NATIVE_TRANSPOSED,
+ GEMM_NATIVE,
+ GEMM_HYBRID,
+ GEMM_INTERLEAVED,
+ GEMM_INTERLEAVED_2D,
+ QUANTIZE_WRAPPER,
+ QUANTIZE_WRAPPER_2D,
+ GEMM_HYBRID_QUANTIZED
+};
+
+enum class WeightFormat
+{
+ UNSPECIFIED = 0x1,
+ ANY = 0x2,
+ OHWI = 0x100100,
+ OHWIo2 = 0x100200,
+ OHWIo4 = 0x100400,
+ OHWIo8 = 0x100800,
+ OHWIo16 = 0x101000,
+ OHWIo32 = 0x102000,
+ OHWIo64 = 0x104000,
+ OHWIo128 = 0x108000,
+ OHWIo4i2 = 0x200400,
+ OHWIo4i2_bf16 = 0x200410,
+ OHWIo8i2 = 0x200800,
+ OHWIo8i2_bf16 = 0x200810,
+ OHWIo16i2 = 0x201000,
+ OHWIo16i2_bf16 = 0x201010,
+ OHWIo32i2 = 0x202000,
+ OHWIo32i2_bf16 = 0x202010,
+ OHWIo64i2 = 0x204000,
+ OHWIo64i2_bf16 = 0x204010,
+ OHWIo4i4 = 0x400400,
+ OHWIo4i4_bf16 = 0x400410,
+ OHWIo8i4 = 0x400800,
+ OHWIo8i4_bf16 = 0x400810,
+ OHWIo16i4 = 0x401000,
+ OHWIo16i4_bf16 = 0x401010,
+ OHWIo32i4 = 0x402000,
+ OHWIo32i4_bf16 = 0x402010,
+ OHWIo64i4 = 0x404000,
+ OHWIo64i4_bf16 = 0x404010,
+ OHWIo2i8 = 0x800200,
+ OHWIo4i8 = 0x800400,
+ OHWIo8i8 = 0x800800,
+ OHWIo16i8 = 0x801000,
+ OHWIo32i8 = 0x802000,
+ OHWIo64i8 = 0x804000
+};
+
+struct KernelDescription
+{
+ GemmMethod method = GemmMethod::DEFAULT;
+ std::string name = "";
+ bool is_default = false;
+ uint64_t cycle_estimate = 0;
+
+ KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0)
+ : method(m), name(n), is_default(d), cycle_estimate(c)
+ {
+ }
+ KernelDescription() noexcept
+ {
+ }
+};
+
+struct GemmConfig
+{
+ GemmMethod method = GemmMethod::DEFAULT;
+ std::string filter = "";
+ unsigned int inner_block_size = 0;
+ unsigned int outer_block_size = 0;
+ WeightFormat weight_format = WeightFormat::ANY;
+
+ GemmConfig(GemmMethod method) : method(method)
+ {
+ }
+ GemmConfig()
+ {
+ }
+};
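+
+// Illustrative example only: requesting the interleaved method and (assumed
+// semantics) restricting kernel selection with the filter string.
+//
+//   arm_gemm::GemmConfig cfg(arm_gemm::GemmMethod::GEMM_INTERLEAVED);
+//   cfg.filter = "a64_";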
+
+struct Activation
+{
+ enum class Type
+ {
+ None,
+ ReLU,
+ BoundedReLU
+ };
+
+ Type type;
+ float param1;
+ float param2;
+
+ Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2)
+ {
+ }
+};
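+
+// Illustrative example only: a bounded ReLU ("ReLU6"), assuming param1 carries
+// the upper bound for Type::BoundedReLU.
+//
+//   arm_gemm::Activation act(arm_gemm::Activation::Type::BoundedReLU, 6.0f);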
+
+struct GemmArgs
+{
+public:
+ const CPUInfo *_ci;
+ unsigned int _Msize; // num of tiles
+ unsigned int _Nsize; // output channels
+ unsigned int _Ksize; // input channels
+ unsigned int _Ksections;
+ unsigned int _nbatches;
+ unsigned int _nmulti; // n_gemms to be performed
+ bool _indirect_input;
+ Activation _act;
+ int _maxthreads;
+ bool _fixed_format;
+ bool _fast_mode;
+ bool _accumulate;
+ const GemmConfig *_cfg;
+
+ GemmArgs(const CPUInfo *ci,
+ unsigned int M,
+ unsigned int N,
+ unsigned int K,
+ unsigned int Ksections,
+ unsigned int nbatches,
+ unsigned int nmulti,
+ bool indirect_input,
+ Activation act,
+ const int maxthreads,
+ bool fixed_format = false,
+ bool fast_mode = false,
+ bool accumulate = false,
+ const GemmConfig *cfg = nullptr)
+ : _ci(ci),
+ _Msize(M),
+ _Nsize(N),
+ _Ksize(K),
+ _Ksections(Ksections),
+ _nbatches(nbatches),
+ _nmulti(nmulti),
+ _indirect_input(indirect_input),
+ _act(act),
+ _maxthreads(maxthreads),
+ _fixed_format(fixed_format),
+ _fast_mode(fast_mode),
+ _accumulate(accumulate),
+ _cfg(cfg)
+ {
+ }
+};
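+
+// Illustrative example only: arguments for a single-threaded F32 GEMM with
+// M=128, N=256, K=64, one K section, one batch, one multi and no activation
+// (`ci` is assumed to be a valid CPUInfo pointer).
+//
+//   arm_gemm::GemmArgs args(ci, 128, 256, 64, /* Ksections */ 1, /* nbatches */ 1,
+//                           /* nmulti */ 1, /* indirect_input */ false,
+//                           arm_gemm::Activation(), /* maxthreads */ 1);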
+
+struct Requantize32
+{
+public:
+ const int32_t *bias = nullptr;
+ size_t bias_multi_stride = 0;
+ int32_t a_offset = 0;
+ int32_t b_offset = 0;
+ int32_t c_offset = 0;
+ bool per_channel_requant = false;
+ int32_t per_layer_left_shift = 0;
+ int32_t per_layer_right_shift = 0;
+ int32_t per_layer_mul = 0;
+ const int32_t *per_channel_left_shifts = nullptr;
+ const int32_t *per_channel_right_shifts = nullptr;
+ const int32_t *per_channel_muls = nullptr;
+ int32_t minval = 0;
+ int32_t maxval = 0;
+
+ Requantize32() = default;
+
+ // Constructor for per-tensor quantization
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
+ int32_t requant_shift,
+ int32_t requant_mul,
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(false),
+ per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
+ per_layer_right_shift(std::min<int32_t>(requant_shift, 0)),
+ per_layer_mul(requant_mul),
+ minval(minv),
+ maxval(maxv)
+ {
+ }
+
+ // Constructor for per-channel quantization
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
+ const int32_t *requant_left_shifts,
+ const int32_t *requant_right_shifts,
+ const int32_t *requant_muls,
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(true),
+ per_channel_left_shifts(requant_left_shifts),
+ per_channel_right_shifts(requant_right_shifts),
+ per_channel_muls(requant_muls),
+ minval(minv),
+ maxval(maxv)
+ {
+ }
+};
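+
+// Illustrative example only: per-tensor requantization with a single
+// multiplier/shift pair, clamped to the signed 8-bit range (all numbers are
+// placeholders, not taken from a real network).
+//
+//   static const int32_t bias[256] = {};
+//   arm_gemm::Requantize32 qp(bias, /* bias_multi_stride */ 0,
+//                             /* a_offset */ 10, /* b_offset */ 0, /* c_offset */ -5,
+//                             /* requant_shift */ -10, /* requant_mul */ 1891289088,
+//                             /* minv */ -128, /* maxv */ 127);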
+
+struct DequantizeFloat
+{
+public:
+ float scale = 0;
+
+ DequantizeFloat() = default;
+
+ // Constructor
+ DequantizeFloat(const float scale) : scale(scale)
+ {
+ }
+};
+
+struct Nothing
+{
+};
+
+template <typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;
+
+/* Low level API calls.
+ * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
+
+/* get_gemm_method(): Given the templated types and provided parameters,
+ * which is the preferred method to implement this GEMM? */
+template <typename Top, typename Tret, class OutputStage = Nothing>
+KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});
+
+template <typename Top, typename Tret, class OutputStage = Nothing>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});
+
+template <typename Top, typename Tret, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});
+
+template <typename Top, typename Tret, class OutputStage = Nothing>
+bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const OutputStage & = {});
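+
+/* Illustrative call sequence (sketch only): create an F32 GEMM from a GemmArgs
+ * instance `args`, hand it the operand pointers/strides, and run the full compute
+ * window on a single thread. Pointer and stride values are assumed to have been
+ * set up by the caller.
+ *
+ *   auto gemm_obj = arm_gemm::gemm<float, float>(args);
+ *   gemm_obj->set_arrays(A, lda, A_batch_stride, A_multi_stride,
+ *                        B, ldb, B_multi_stride,
+ *                        C, ldc, C_batch_stride, C_multi_stride,
+ *                        bias, bias_multi_stride);
+ *   const auto range = gemm_obj->get_window_size();
+ *   ndcoord_t work{{0, range.get_size(0)}, {0, range.get_size(1)}, {0, range.get_size(2)},
+ *                  {0, range.get_size(3)}, {0, range.get_size(4)}, {0, range.get_size(5)}};
+ *   gemm_obj->execute(work, {}, 0);
+ */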
+
+} // namespace arm_gemm
+
+#endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP
diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
new file mode 100644
index 0000000000..0672e899b6
--- /dev/null
+++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Window.h"
+
+#include "ndrange.hpp"
+#include <cassert>
+
+/* This file contains the mapping between the integral types used in arm_compute and arm_gemm.
+ * The two codebases require a degree of separation for the sake of modularity,
+ * so each maintains its own types to represent similar information.
+ */
+
+namespace arm_gemm
+{
+//we want to unify the maximum number of dimensions used between arm_gemm and the Arm Compute Library
+constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions;
+
+using ndrange_t = NDRange<ndrange_max>;
+using ndcoord_t = NDCoordinate<ndrange_max>;
+
+/* Converts an `arm_gemm::ndrange_t` to an `arm_compute::Window`
+ *
+ * As `NDRange<T>` does not encode start positions, we specify
+ * the start to be zero in the produced `arm_compute::Window`
+ *
+ * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into an `arm_compute::Window`
+ * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr`
+ */
+inline arm_compute::Window to_window(const ndrange_t &ndr)
+{
+ arm_compute::Window win;
+
+ for (unsigned int i = 0; i != ndrange_max; ++i)
+ {
+ //populate the window with the dimensions of the NDRange
+ win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
+ }
+
+ return win;
+}
+
+/*
+ * Converts an `arm_gemm::ndcoord_t` to an `arm_compute::Window`
+ *
+ * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into an `arm_compute::Window`
+ * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc`
+ */
+inline arm_compute::Window to_window(const ndcoord_t &ndc)
+{
+ arm_compute::Window win;
+
+ for (unsigned int i = 0; i != ndrange_max; ++i)
+ {
+ const auto start = ndc.get_position(i);
+ const auto size = ndc.get_size(i);
+ const auto stop = start + size;
+
+ //populate the window with the dimensions of the NDRange
+ win.set(i, arm_compute::Window::Dimension(start, stop));
+ }
+
+ return win;
+}
+
+/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions
+ *
+ * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()`,
+ * whereas `arm_gemm::ndrange_t` only has a size; as a result we store the difference between the two.
+ *
+ * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
+ * @return the resultant ndrange_t
+ */
+inline ndrange_t to_ndrange(const arm_compute::Window &win)
+{
+ return {static_cast<unsigned int>(win[0].end() - win[0].start()),
+ static_cast<unsigned int>(win[1].end() - win[1].start()),
+ static_cast<unsigned int>(win[2].end() - win[2].start()),
+ static_cast<unsigned int>(win[3].end() - win[3].start()),
+ static_cast<unsigned int>(win[4].end() - win[4].start()),
+ static_cast<unsigned int>(win[5].end() - win[5].start())};
+}
+
+/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
+ *
+ * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
+ * @return the resultant ndcoord_t
+ */
+inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
+{
+ return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())},
+ {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())},
+ {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())},
+ {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())},
+ {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())},
+ {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}};
+}
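+
+/* Illustrative example only: a window covering rows [0, 128) and columns [4, 20)
+ * converted to the arm_gemm representation and back.
+ *
+ *   arm_compute::Window win;
+ *   win.set(0, arm_compute::Window::Dimension(0, 128));
+ *   win.set(1, arm_compute::Window::Dimension(4, 20));
+ *
+ *   const auto ndc  = to_ndcoord(win); // dim 0: start 0, size 128; dim 1: start 4, size 16
+ *   const auto back = to_window(ndc);  // same start/end as `win` for dims 0 and 1
+ */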
+
+} //namespace arm_gemm
diff --git a/src/cpu/kernels/assembly/arm_gemm_local.hpp b/src/cpu/kernels/assembly/arm_gemm_local.hpp
new file mode 100644
index 0000000000..78e0adf31f
--- /dev/null
+++ b/src/cpu/kernels/assembly/arm_gemm_local.hpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+using CPUModel = arm_compute::CPUModel;
+using CPUInfo = arm_compute::CPUInfo;
diff --git a/src/cpu/kernels/assembly/convolution_parameters.hpp b/src/cpu/kernels/assembly/convolution_parameters.hpp
new file mode 100644
index 0000000000..0c1ae58902
--- /dev/null
+++ b/src/cpu/kernels/assembly/convolution_parameters.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstdint>
+
+namespace arm_gemm
+{
+/*
+ * Parameter set for "convolution" type GEMM.
+ *
+ * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if
+ * an im2row had been performed on the input tensor to generate the operand
+ * matrix, but instead this structure describes the convolution parameters
+ * such that this can be done on the fly.
+ *
+ * The parameters describe the convolution details - the notional shape of
+ * the input and output tensors, whether padding is to be applied, the size
+ * of the kernel and a constant value to be used for padding (needed for
+ * quantized tensors).
+ *
+ * The second part describes the layout of the input tensor in memory, which
+ * is assumed to be in NHWC format. This consists of a base pointer and
+ * strides for columns, rows and batches. 'multis' are not supported for
+ * convolution type GEMMs.
+ */
+struct ConvolutionParameters
+{
+ int64_t input_width;
+ int64_t input_height;
+ int64_t input_channels;
+ int64_t kernel_width;
+ int64_t kernel_height;
+ int64_t output_width;
+ int64_t output_height;
+ int64_t output_stride_w;
+ int64_t output_stride_h;
+ // output_channels not included as they do not affect the input.
+ int64_t padding_top;
+ int64_t padding_left;
+ float padding_value;
+};
+
+} // namespace arm_gemm
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
new file mode 100644
index 0000000000..45d1e43274
--- /dev/null
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2017-2021,2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_COMMON_HPP
+#define ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_COMMON_HPP
+
+#pragma once
+
+#include "convolution_parameters.hpp"
+#include "ndrange.hpp"
+#include <cstddef>
+
+namespace arm_gemm
+{
+// Avoid circular dependency with arm_gemm.hpp
+struct GemmConfig;
+
+// Abstract class for the GEMM/GEMV functions.
+//
+// GEMM implementations may be "native" (never require any input
+// permutation), "pretransposed" (require permutation up-front) or require
+// working space (permute as they go along). This interface should support
+// all of them.
+
+// The real GemmCommon class is templated based on the operand and return
+// type. This is an interface class which is independent of those types.
+class IGemmCommon
+{
+public:
+    /* Pass in the pointers to the arrays to be operated on and their
+     * strides. This "generic" version uses void *s; the preferred version
+     * is the one provided by the templated GemmCommon (below), which takes
+     * appropriately typed pointers. If B is pretransposed (see below) then
+     * the settings for B here are ignored.
+     */
+ virtual void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+
+ /** @returns an ndrange containing ranges of the compute space which can be
+ * broken up and parallelised over
+ */
+ virtual ndrange_t get_window_size() const = 0;
+
+    /* The maximum thread count is specified when the GEMM is created. Some
+     * implementations need to know how many threads will actually run in
+     * order to work properly.
+     *
+     * In some cases, after creating the GEMM the number of threads needs to
+     * be reduced (e.g. not enough work to split across threads). This
+     * method allows the actual number of threads that will run to be set
+     * (it must be equal to or lower than the maximum).
+     *
+     * This has an empty default implementation, as GEMMs which don't care
+     * about thread count can safely ignore this.
+     */
+ virtual void set_nthreads(int){};
+
+ /* Whether this GEMM can be dynamically scheduled or not. */
+ virtual bool supports_dynamic_scheduling() const
+ {
+ return false;
+ }
+
+    /** Main execute member function
+     * @param [in] work_range     specifies the range of work we want to be computed; the total range is defined by get_window_size()
+     * @param [in] thread_locator where we are inside the thread space
+     * @param [in] threadid       a unique threadid
+     */
+ virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;
+
+ /*** Working space interface (optional) ***/
+ /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
+ virtual size_t get_working_size() const
+ {
+ return 0;
+ }
+ /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
+ virtual void set_working_space(void *){};
+
+ /*** "Pretransposed" interface (optional) ***/
+    /* Is this object set up for pretranspose? If so, pretranspose_B_array() needs to be called before execute(). */
+ virtual bool B_is_pretransposed() const
+ {
+ return false;
+ }
+ /* Does pretranspose still need to be done? */
+ virtual bool B_pretranspose_required() const
+ {
+ return false;
+ }
+ /* Does pretranspose accept the transposed flag? */
+ virtual bool B_pretranspose_supports_transpose() const
+ {
+ return false;
+ }
+ /* Total number of bytes of space needed for pretransposed arrays. */
+ virtual size_t get_B_pretransposed_array_size() const
+ {
+ return 0;
+ }
+ /* Amount of work for the threaded cases */
+ virtual size_t get_B_pretranspose_window_size() const
+ {
+ return 1;
+ }
+ /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
+ /* The "real" version of this depends on the templated operand type (see below). */
+ virtual void pretranspose_B_array_generic(void *, const void *, const int, const int, bool) = 0;
+ /* Threaded version with window start/end parameters */
+ virtual void
+ pretranspose_B_array_part_generic(void *, const void *, const int, const int, bool, const size_t, const size_t) = 0;
+
+ /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
+ virtual void set_pretransposed_B_data(void *)
+ {
+ }
+
+ /*** "Quantized bias" interface (optional) ***/
+ /* Set the bias vector for quantized GEMMs */
+ virtual void set_quantized_bias(const int32_t *, size_t)
+ {
+ }
+
+ /*** Indirect interface (optional) ***/
+    /* Set the indirect table. This comprises a number of values per kernel point, and a densely packed array of
+     * multis * batches * kernel_points pointers. */
+ virtual void set_indirect_parameters_generic(size_t, const void *const *const *)
+ {
+ }
+
+ /*** Convolution interface (optional) ***/
+ /* Set the convolution parameters. */
+ virtual void set_convolution_parameters(ConvolutionParameters)
+ {
+ }
+
+    /*** Dequantize scale interface (optional) ***/
+    /* Set the dequantize scale for GEMMs when converting from int to float (float out = scale * float(int out)) */
+ virtual void set_dequantize_scale(const float)
+ {
+ }
+
+ /*** Introspection interface ***/
+ /* Get the configuration of this GEMM */
+ virtual GemmConfig get_config() = 0;
+
+ // Destructor
+ virtual ~IGemmCommon()
+ {
+ }
+};
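+
+/* Illustrative call order for a driver using this interface (sketch only; buffer
+ * allocation, error handling and the array setters are omitted here - see the
+ * typed GemmCommon::set_arrays() overload below):
+ *
+ *   gemm->set_nthreads(1);
+ *   if (gemm->get_working_size() > 0)
+ *   {
+ *       gemm->set_working_space(workspace);
+ *   }
+ *   if (gemm->B_pretranspose_required())
+ *   {
+ *       gemm->pretranspose_B_array_generic(packed_b, b, ldb, b_multi_stride, false);
+ *   }
+ *   gemm->execute(work_range, thread_locator, 0);
+ */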
+
+/* "Real" GemmCommon class which is templated on the operand and return types.
+ *
+ * In addition to correctly typed versions of the functions that operate on
+ * operand and return data, this class provides a default implementation of
+ * 'set_arrays' to capture the provided arguments in protected class
+ * members, as essentially any implementation will need these.
+ */
+template <typename To, typename Tr>
+class GemmCommon : public IGemmCommon
+{
+protected:
+ const To *_Aptr = nullptr;
+ int _lda = 0;
+ int _A_batch_stride = 0;
+ int _A_multi_stride = 0;
+ const To *_Bptr = nullptr;
+ int _ldb = 0;
+ int _B_multi_stride = 0;
+ Tr *_Cptr = nullptr;
+ int _ldc = 0;
+ int _C_batch_stride = 0;
+ int _C_multi_stride = 0;
+ const Tr *_bias = nullptr;
+ int _bias_multi_stride = 0;
+
+public:
+ /* Pass in the pointers to the arrays to be operated on and their
+ * strides (templated version with appropriate types). */
+ virtual void set_arrays(const To *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const To *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ Tr *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const Tr *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride)
+ {
+ _Aptr = A;
+ _lda = lda;
+ _A_batch_stride = A_batch_stride;
+ _A_multi_stride = A_multi_stride;
+ _Bptr = B;
+ _ldb = ldb;
+ _B_multi_stride = B_multi_stride;
+ _Cptr = C;
+ _ldc = ldc;
+ _C_batch_stride = C_batch_stride;
+ _C_multi_stride = C_multi_stride;
+ _bias = bias;
+ _bias_multi_stride = bias_multi_stride;
+ }
+
+ /* Implementation of the void * overload which casts its arguments to the appropriate type. */
+ void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) override
+ {
+ set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb,
+ B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
+ static_cast<const Tr *>(bias), bias_multi_stride);
+ }
+
+ /*** "Pretransposed" interface ***/
+
+ /* Compute col sums over all columns */
+ virtual void requantize_bias(void *, const To *, const int, const int){};
+
+ /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
+ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
+ virtual void pretranspose_B_array(void *, const To *, const int, const int, bool){};
+
+ /* Implementation of the void * overload which casts its arguments to the appropriate type. */
+ void pretranspose_B_array_generic(
+ void *out, const void *in, const int row_stride, const int multi_stride, bool transposed) override
+ {
+ pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride, transposed);
+ }
+
+    /* Threaded versions of the above.
+     * The fallback/backwards-compatible version of the threaded interface exposes a window size of 1 and
+     * just calls the non-threaded functions to do the work. This is valid because, with a window size of 1,
+     * the only legal values for start and end are 0 and 1 respectively. */
+ virtual void pretranspose_B_array_part(
+ void *out, const To *in, const int row_stride, const int multi_stride, bool transposed, size_t, size_t)
+ {
+ pretranspose_B_array(out, in, row_stride, multi_stride, transposed);
+ };
+
+ void pretranspose_B_array_part_generic(void *out,
+ const void *in,
+ const int row_stride,
+ const int multi_stride,
+ bool transposed,
+ size_t start,
+ size_t end) override
+ {
+ pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, transposed, start, end);
+ }
+
+ /*** Indirect interface ***/
+ virtual void set_indirect_parameters(size_t, const To *const *const *)
+ {
+ }
+
+ void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override
+ {
+ set_indirect_parameters(sz, reinterpret_cast<const To *const *const *>(ptr));
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_COMMON_HPP
diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp
new file mode 100644
index 0000000000..baccdc0d88
--- /dev/null
+++ b/src/cpu/kernels/assembly/ndrange.hpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <initializer_list>
+
+namespace arm_gemm
+{
+template <unsigned int D>
+class NDRange
+{
+private:
+ std::array<unsigned int, D> m_sizes{};
+ std::array<unsigned int, D> m_totalsizes{};
+
+ class NDRangeIterator
+ {
+ private:
+ const NDRange &m_parent;
+ unsigned int m_pos = 0;
+ unsigned int m_end = 0;
+
+ public:
+ NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e)
+ {
+ }
+
+ bool done() const
+ {
+ return (m_pos >= m_end);
+ }
+
+ unsigned int dim(unsigned int d) const
+ {
+ unsigned int r = m_pos;
+
+ if (d < (D - 1))
+ {
+ r %= m_parent.m_totalsizes[d];
+ }
+
+ if (d > 0)
+ {
+ r /= m_parent.m_totalsizes[d - 1];
+ }
+
+ return r;
+ }
+
+ bool next_dim0()
+ {
+ m_pos++;
+
+ return !done();
+ }
+
+ bool next_dim1()
+ {
+ m_pos += m_parent.m_sizes[0] - dim(0);
+
+ return !done();
+ }
+
+ unsigned int dim0_max() const
+ {
+ unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
+
+ return dim(0) + offset;
+ }
+ };
+
+ void set_totalsizes()
+ {
+ unsigned int t = 1;
+
+ for (unsigned int i = 0; i < D; i++)
+ {
+ if (m_sizes[i] == 0)
+ {
+ m_sizes[i] = 1;
+ }
+
+ t *= m_sizes[i];
+
+ m_totalsizes[i] = t;
+ }
+ }
+
+public:
+ NDRange &operator=(const NDRange &rhs) = default;
+ NDRange(const NDRange &rhs) = default;
+
+ template <typename... T>
+ NDRange(T... ts) : m_sizes{ts...}
+ {
+ set_totalsizes();
+ }
+
+ NDRange(const std::array<unsigned int, D> &n) : m_sizes(n)
+ {
+ set_totalsizes();
+ }
+
+ NDRangeIterator iterator(unsigned int start, unsigned int end) const
+ {
+ return NDRangeIterator(*this, start, end);
+ }
+
+ unsigned int total_size() const
+ {
+ return m_totalsizes[D - 1];
+ }
+
+ unsigned int get_size(unsigned int v) const
+ {
+ return m_sizes[v];
+ }
+};
+
+/** NDCoordinate builds upon a range, but specifies a starting position
+ * in addition to the size it inherits from NDRange.
+ */
+template <unsigned int N>
+class NDCoordinate : public NDRange<N>
+{
+ using int_t = unsigned int;
+ using ndrange_t = NDRange<N>;
+
+ std::array<int_t, N> m_positions{};
+
+public:
+ NDCoordinate &operator=(const NDCoordinate &rhs) = default;
+ NDCoordinate(const NDCoordinate &rhs) = default;
+ NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list)
+ {
+ std::array<int_t, N> sizes{};
+
+ std::size_t i = 0;
+ for (auto &p : list)
+ {
+ m_positions[i] = p.first;
+ sizes[i++] = p.second;
+ }
+
+        //update the parent's sizes
+ static_cast<ndrange_t &>(*this) = ndrange_t(sizes);
+ }
+
+ int_t get_position(int_t d) const
+ {
+ assert(d < N);
+
+ return m_positions[d];
+ }
+
+ void set_position(int_t d, int_t v)
+ {
+ assert(d < N);
+
+ m_positions[d] = v;
+ }
+
+ int_t get_position_end(int_t d) const
+ {
+ return get_position(d) + ndrange_t::get_size(d);
+ }
+}; //class NDCoordinate
+
+using ndrange_t = NDRange<6>;
+using ndcoord_t = NDCoordinate<6>;
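+
+// Illustrative example only: a coordinate spanning rows [8, 64) and columns [0, 16);
+// each initializer pair is (start position, size), and unused dimensions default to size 1.
+//
+//   ndcoord_t coord{{8, 56}, {0, 16}};
+//   const auto row_end = coord.get_position_end(0); // 8 + 56 = 64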
+
+} // namespace arm_gemm
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..dbdec5fb50
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
+{
+ return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..0224b3406a
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
+{
+ return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
new file mode 100644
index 0000000000..5a2939b587
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+
+#include "src/cpu/CpuTypes.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void bounding_box_transform_qsymm16(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
+
+{
+ const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
+ const size_t deltas_width = deltas->info()->tensor_shape()[0];
+ const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f);
+ const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f);
+
+ const auto scale_after = (bbinfo.apply_scale() ? bbinfo.scale() : 1.f);
+ const auto scale_before = bbinfo.scale();
+ const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f);
+
+ auto pred_ptr =
+ reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
+ auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
+
+ const auto boxes_qinfo = boxes->info()->quantization_info().uniform();
+ const auto deltas_qinfo = deltas->info()->quantization_info().uniform();
+ const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform();
+
+ Iterator box_it(boxes, window);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
+ const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
+ const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
+ const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
+ const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
+ const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
+ const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
+ const float ctr_x = (b0 / scale_before) + 0.5f * width;
+ const float ctr_y = (b1 / scale_before) + 0.5f * height;
+ for (size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
+ const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
+ float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
+ float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
+ // Clip dw and dh
+ dw = std::min(dw, bbinfo.bbox_xform_clip());
+ dh = std::min(dh, bbinfo.bbox_xform_clip());
+ // Determine the predictions
+ const float pred_ctr_x = dx * width + ctr_x;
+ const float pred_ctr_y = dy * height + ctr_y;
+ const float pred_w = std::exp(dw) * width;
+ const float pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 1] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 2] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f),
+ pred_qinfo);
+ pred_ptr[delta_id + 3] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f),
+ pred_qinfo);
+ }
+ },
+ box_it);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
new file mode 100644
index 0000000000..d8013c6227
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
+#define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void bounding_box_transform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
+{
+ const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
+ const size_t deltas_width = deltas->info()->tensor_shape()[0];
+ const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f);
+ const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f);
+
+ const auto scale_after = (bbinfo.apply_scale() ? T(bbinfo.scale()) : T(1));
+ const auto scale_before = T(bbinfo.scale());
+ ARM_COMPUTE_ERROR_ON(scale_before <= 0);
+ const auto offset = (bbinfo.correct_transform_coords() ? T(1.f) : T(0.f));
+
+ auto pred_ptr = reinterpret_cast<T *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
+ auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
+
+ Iterator box_it(boxes, window);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto ptr = reinterpret_cast<T *>(box_it.ptr());
+ const auto b0 = *ptr;
+ const auto b1 = *(ptr + 1);
+ const auto b2 = *(ptr + 2);
+ const auto b3 = *(ptr + 3);
+ const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
+ const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
+ const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
+ const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
+ for (size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
+ const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
+ T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
+ T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
+ // Clip dw and dh
+ dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
+ dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
+ // Determine the predictions
+ const T pred_ctr_x = dx * width + ctr_x;
+ const T pred_ctr_y = dy * height + ctr_y;
+ const T pred_w = std::exp(dw) * width;
+ const T pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 1] =
+ scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
+ pred_ptr[delta_id + 2] =
+ scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 3] =
+ scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
+ }
+ },
+ box_it);
+}
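+
+// Note: the loop above is the usual box-delta decoding used by R-CNN style detectors:
+// the input box is converted to centre/size form, shifted by (dx, dy) scaled by the box
+// size, resized by exp(dw) / exp(dh), converted back to corner form and clamped to the
+// image boundaries.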
+
+void bounding_box_transform_qsymm16(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
new file mode 100644
index 0000000000..64ef815195
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu16_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
+{
+ return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h
new file mode 100644
index 0000000000..4da725a257
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/list.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H
+#define SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \
+ void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \
+ const Window &window)
+DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform);
+DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform);
+DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform);
+#undef DECLARE_BOUNDINGBOXTRANFORM_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..2897f4b242
--- /dev/null
+++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/cpu/kernels/cast/list.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
+#include "support/SaturateCast.h"
+
+#include "arm_neon.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_signed_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(_policy);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON(_src == _dst);
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator src(_src, win);
+ Iterator dst(_dst, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+
+ const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+}
+
+void neon_s32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(_policy);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON(_src == _dst);
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator src(_src, win);
+ Iterator dst(_dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t texels = {
+ {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
+ vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}};
+
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+}
+
+void neon_fp32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(_policy);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON(_src == _dst);
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator src(_src, win);
+ Iterator dst(_dst, win);
+
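+ /* Down-conversion F32 -> F16 */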
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4),
+ vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}};
+
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+}
+
+void neon_fp16_to_other_dt_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(_policy);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON(_src == _dst);
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator src(_src, win);
+ Iterator dst(_dst, win);
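+ /* F16 is the common source type here; dispatch on the destination data type */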
+ switch (_dst->info()->data_type())
+ {
+ case DataType::QASYMM8_SIGNED:
+ {
+ /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float16x8x2_t texels = {{
+ vld1q_f16(src_ptr + x),
+ vld1q_f16(src_ptr + x + 8),
+ }};
+
+ vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float16x8x2_t texels = {{
+ vld1q_f16(src_ptr + x),
+ vld1q_f16(src_ptr + x + 8),
+ }};
+
+ vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion F16 -> F32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+ vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
+ vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion F16 -> S32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+
+ vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
+ vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+}
+
+void neon_u8_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(_policy);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON(_src == _dst);
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator src(_src, win);
+ Iterator dst(_dst, win);
+ /* Up-conversion U8 -> F16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+#endif /* #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h
new file mode 100644
index 0000000000..5e634fc170
--- /dev/null
+++ b/src/cpu/kernels/cast/list.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CAST_LIST_H
+#define SRC_CORE_NEON_KERNELS_CAST_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_CAST_KERNEL(func_name) \
+ void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, \
+ const Window &window)
+
+DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast);
+DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast);
+DECLARE_CAST_KERNEL(neon_fp16_to_other_dt_cast);
+DECLARE_CAST_KERNEL(neon_s32_to_fp16_cast);
+DECLARE_CAST_KERNEL(neon_qasymm8_signed_to_fp16_cast);
+DECLARE_CAST_KERNEL(neon_fp32_to_bfloat16_cast);
+DECLARE_CAST_KERNEL(neon_bfloat16_to_fp32_cast);
+
+#undef DECLARE_CAST_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h
new file mode 100644
index 0000000000..082c60be29
--- /dev/null
+++ b/src/cpu/kernels/conv3d/neon/list.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
+#define SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/conv3d/neon/quantized.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void directconv3d_float_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
+{
+ const ITensor *src = src0;
+ const ITensor *weights = src1;
+ const ITensor *biases = src2;
+
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+ constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
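+ // Number of elements of type T that fit in one 128-bit Neon register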
+
+ // Scalar quantities (N D H W Cin)
+ const int element_size = src->info()->element_size();
+ const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
+ const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
+ const int input_stride_d = src->info()->strides_in_bytes()[3] / element_size;
+ const int input_stride_n = src->info()->strides_in_bytes()[4] / element_size;
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ // Kernel info (D H W Cin Cout)
+ const unsigned int kernel_stride_w = weights->info()->strides_in_bytes()[2] / element_size;
+ const unsigned int kernel_stride_h = weights->info()->strides_in_bytes()[3] / element_size;
+ const unsigned int kernel_stride_d = weights->info()->strides_in_bytes()[4] / element_size;
+ const int kernel_dim_w = weights->info()->dimension(2);
+ const int kernel_dim_h = weights->info()->dimension(3);
+ const int kernel_dim_d = weights->info()->dimension(4);
+
+ // Convolution padding and stride
+ const int conv_pad_top = conv_info.padding.top;
+ const int conv_pad_left = conv_info.padding.left;
+ const int conv_pad_front = conv_info.padding.front;
+ const int conv_stride_w = conv_info.stride.width;
+ const int conv_stride_h = conv_info.stride.height;
+ const int conv_stride_d = conv_info.stride.depth;
+
+ // Set up the window for the output iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Set up the window for the weights iterator
+ Window window_w = calculate_max_window(*weights->info(), Steps());
+ window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimW, Window::Dimension(0, 1, 1));
+ window_w.set(4, Window::Dimension(0, 1, 1));
+
+ Iterator out(dst, window_out);
+ Iterator wei(weights, window_w);
+
+ const T *biases_ptr = nullptr;
+ if (biases != nullptr)
+ {
+ biases_ptr = reinterpret_cast<T *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
+ }
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+ const int in_d_end_t = in_d_start_t + kernel_dim_d;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_d_start = std::max(in_d_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+ const int in_d_end = std::min(in_d_end_t, input_dim_d);
+
+ // We use the input points to select the valid weight points
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_d_start = in_d_start - in_d_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+ const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
+
+ const int index_c_out_end = weights->info()->dimension(0);
+ const int index_c_in_end = weights->info()->dimension(1);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[4] * input_stride_n;
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
+ * This is the loop over the weights; it iterates along the OFM (output feature map) dimension.
+ */
+ const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ T out_temp = static_cast<T>(0);
+ T *out_ptr = reinterpret_cast<T *>(out.ptr());
+ for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+ ++index_wei_d, ++index_in_d)
+ {
+ const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
+ const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
+ {
+ const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
+ {
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c_in = 0;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+ index_c_in += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ // Load the Cin weights lane by lane (Cout is the innermost weights dimension)
+ for (int k = 0; k < num_elems_read_per_iteration;
+ ++k, weights_ptr_mover += index_c_out_end)
+ {
+ w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ }
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_c_in < index_c_in_end;
+ ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_mover);
+ out_temp += src_val * w_val;
+ }
+ }
+ }
+ }
+ *(reinterpret_cast<T *>(out_ptr + id_w[0])) =
+ (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp;
+ },
+ wei);
+ },
+ out);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h
new file mode 100644
index 0000000000..f0fc9b5a71
--- /dev/null
+++ b/src/cpu/kernels/conv3d/neon/quantized.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
+#define SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void directconv3d_quantized_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
+{
+ const ITensor *src = src0;
+ const ITensor *weights = src1;
+ const ITensor *biases = src2;
+
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+ constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+ using q16_t = typename wrapper::traits::promote_t<T>;
+ using q32_t = typename wrapper::traits::promote_t<q16_t>;
+ using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
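+ // q16_t/q32_t are the 16-bit and 32-bit promotions of T, used for widening accumulation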
+
+ const int32_t input_offset = -src->info()->quantization_info().uniform().offset;
+ const float input_scale = src->info()->quantization_info().uniform().scale;
+ const int32_t weights_offset = -weights->info()->quantization_info().uniform().offset;
+ const float weights_scale = weights->info()->quantization_info().uniform().scale;
+ const int32_t output_offset = dst->info()->quantization_info().uniform().offset;
+ const float output_scale = dst->info()->quantization_info().uniform().scale;
+
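+ // Fold the input, weights and output scales into a single fixed-point multiplier and shift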
+ int32_t output_multiplier = 0;
+ int32_t output_shift = 0;
+ const float multiplier = input_scale * weights_scale / output_scale;
+ arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+
+ // Scalar quantities (N D H W Cin)
+ const int element_size = src->info()->element_size();
+ const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
+ const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
+ const int input_stride_d = src->info()->strides_in_bytes()[3] / element_size;
+ const int input_stride_n = src->info()->strides_in_bytes()[4] / element_size;
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ // Kernel info (D H W Cin Cout)
+ const unsigned int kernel_stride_w = weights->info()->strides_in_bytes()[2] / element_size;
+ const unsigned int kernel_stride_h = weights->info()->strides_in_bytes()[3] / element_size;
+ const unsigned int kernel_stride_d = weights->info()->strides_in_bytes()[4] / element_size;
+ const int kernel_dim_w = weights->info()->dimension(2);
+ const int kernel_dim_h = weights->info()->dimension(3);
+ const int kernel_dim_d = weights->info()->dimension(4);
+
+ // Convolution padding and stride
+ const int conv_pad_top = conv_info.padding.top;
+ const int conv_pad_left = conv_info.padding.left;
+ const int conv_pad_front = conv_info.padding.front;
+ const int conv_stride_w = conv_info.stride.width;
+ const int conv_stride_h = conv_info.stride.height;
+ const int conv_stride_d = conv_info.stride.depth;
+
+ // Set up the window for the output iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Set up the window for the weights iterator
+ Window window_w = calculate_max_window(*weights->info(), Steps());
+ window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimW, Window::Dimension(0, 1, 1));
+ window_w.set(4, Window::Dimension(0, 1, 1));
+
+ Iterator out(dst, window_out);
+ Iterator wei(weights, window_w);
+
+ const int32_t *biases_ptr = nullptr;
+ if (biases != nullptr)
+ {
+ biases_ptr = reinterpret_cast<int32_t *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
+ }
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+ const int in_d_end_t = in_d_start_t + kernel_dim_d;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_d_start = std::max(in_d_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+ const int in_d_end = std::min(in_d_end_t, input_dim_d);
+
+ // We use the input points to select the valid weight points
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_d_start = in_d_start - in_d_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+ const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
+
+ const int index_c_out_end = weights->info()->dimension(0);
+ const int index_c_in_end = weights->info()->dimension(1);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[4] * input_stride_n;
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
+ * This is the loop over the weights; it iterates along the OFM (output feature map) dimension.
+ */
+ const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ int32_t acc = static_cast<int32_t>(0);
+ T *out_ptr = reinterpret_cast<T *>(out.ptr());
+ for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+ ++index_wei_d, ++index_in_d)
+ {
+ const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
+ const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
+ {
+ const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
+ {
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c_in = 0;
+ vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+
+ q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+
+ for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+ index_c_in += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ // Load the Cin weights lane by lane (Cout is the innermost weights dimension)
+ for (int k = 0; k < num_elems_read_per_iteration;
+ ++k, weights_ptr_mover += index_c_out_end)
+ {
+ w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ }
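+ // Initialise the widened lanes with the quantization offsets so that
+ // (src + input_offset) * (wei + weights_offset) is accumulated in 32 bits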
+ q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+
+ q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+
+ const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec));
+ const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec));
+ const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec));
+ const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec));
+
+ src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0)));
+ src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0)));
+ src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1)));
+ src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1)));
+
+ wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0)));
+ wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0)));
+ wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1)));
+ wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1)));
+
+ acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0);
+ acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1);
+ acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2);
+ acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3);
+ }
+#if defined(__aarch64__)
+ acc += wrapper::vaddv(acc_q32_0);
+ acc += wrapper::vaddv(acc_q32_1);
+ acc += wrapper::vaddv(acc_q32_2);
+ acc += wrapper::vaddv(acc_q32_3);
+#else // __aarch64__
+ auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
+
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
+
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
+
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
+
+#endif // __aarch64__
+
+ for (; index_c_in < index_c_in_end;
+ ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+ {
+ const auto src_val = *(in_ptr_mover) + input_offset;
+ const auto w_val = *(weights_ptr_mover) + weights_offset;
+ acc += src_val * w_val;
+ }
+ }
+ }
+ }
+
+ if (biases)
+ {
+ acc += biases_ptr[id_w[0]];
+ }
+
+ T out_val =
+ finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false);
+ *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val;
+ },
+ wei);
+ },
+ out);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h
new file mode 100644
index 0000000000..8fb7ad2087
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
+#define SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
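+ // load_as_f32 loads elements of the given type and converts them to a float32x4_t;
+ // the unspecialised template is an error trap for unsupported types.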
+template <typename T>
+inline float32x4_t load_as_f32(T *ptr)
+{
+ ARM_COMPUTE_UNUSED(ptr);
+ ARM_COMPUTE_ERROR("Type not supported.");
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template <>
+inline float32x4_t load_as_f32(float16_t *ptr)
+{
+ return vcvt_f32_f16(wrapper::vload(ptr));
+}
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
+template <>
+inline float32x4_t load_as_f32(float *ptr)
+{
+ return wrapper::vloadq(ptr);
+}
+
+template <>
+inline float32x4_t load_as_f32(int32_t *ptr)
+{
+ return vcvtq_f32_s32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint32_t *ptr)
+{
+ return vcvtq_f32_u32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(int16_t *ptr)
+{
+ return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint16_t *ptr)
+{
+ return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint8_t *ptr)
+{
+ return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..3739c9d4e0
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..f665c3652c
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h
new file mode 100644
index 0000000000..b90ba9ddbf
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/impl.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CROP_IMPL_H
+#define SRC_CORE_NEON_KERNELS_CROP_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/crop/generic/neon/crop_helper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ // Reverse elements if width flipped.
+ if (is_width_flipped)
+ {
+ // Collapse first dimension if possible.
+ if (input_has_single_channel)
+ {
+ int32_t x = output_width_start;
+ Coordinates negative_offset(input_offset);
+ negative_offset.set(1, negative_offset[1] - window_step_x + 1);
+ for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
+
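+ // Reverse the four lanes: vrev64 swaps elements within each 64-bit half,
+ // then exchanging the halves completes the reversal.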
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+
+ wrapper::vstore(output_ptr + x, in);
+ }
+ input_offset[1] = negative_offset[1] + window_step_x - 1;
+ for (; x < output_width_limit; ++x, --input_offset[1])
+ {
+ *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ else
+ {
+ for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
+ {
+ input_offset.set(0, 0);
+ int32_t c = 0;
+ for (; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x;
+ c += window_step_x, input_offset[0] += window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
+ }
+ for (; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
+ {
+ *(output_ptr + x * output->info()->dimension(0) + c) =
+ static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ }
+ }
+ else
+ {
+ // Use memcpy if the elements don't need converting to float.
+ if (std::is_same<T, float>::value)
+ {
+ memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
+ reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
+ (output_width_limit - output_width_start) * output->info()->dimension(0) *
+ output->info()->element_size());
+ }
+ else
+ {
+ int32_t x = 0;
+ int32_t limit =
+ (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ wrapper::vstore(output_start_ptr + x, in);
+ }
+ for (; x < limit; ++x, ++input_offset[0])
+ {
+ *(output_start_ptr + x) =
+ static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_CROP_IMPL_H
diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp
new file mode 100644
index 0000000000..602434f54f
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/integer.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+#include "src/cpu/kernels/crop/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void u8_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void u16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void u32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void s8_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void s16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void s32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
+{
+ return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h
new file mode 100644
index 0000000000..9cb7726203
--- /dev/null
+++ b/src/cpu/kernels/crop/list.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CROP_LIST_H
+#define SRC_CORE_NEON_KERNELS_CROP_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_CROP_KERNEL(func_name) \
+ void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, \
+ bool input_has_single_channel, bool is_width_flipped)
+
+DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(s8_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(s16_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(s32_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(u8_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(u16_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(u32_in_bounds_crop_window);
+
+#undef DECLARE_CROP_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_CROP_LIST_H
diff --git a/src/cpu/kernels/depth_to_space/list.h b/src/cpu/kernels/depth_to_space/list.h
new file mode 100644
index 0000000000..9d0cd1e740
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/list.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H
+#define ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#define DECLARE_DEPTHTOSPACE_KERNEL(func_name) \
+ void func_name(const uint8_t *src, uint8_t *dst, const uintptr_t src_shape[4], const uintptr_t src_strides[4], \
+ const uintptr_t dst_strides[4], uintptr_t element_size, uintptr_t block_size)
+
+DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nhwc_any);
+DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nchw_any);
+
+#undef DECLARE_DEPTHTOSPACE_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H
diff --git a/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp
new file mode 100644
index 0000000000..0277690112
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+
+#include <cstdint>
+#include <cstring>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+void depth_to_space_nchw_any( //
+ const uint8_t *src,
+ uint8_t *dst,
+ const uintptr_t src_shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4],
+ uintptr_t element_size,
+ uintptr_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
+ ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
+
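+ // Each output channel gathers block_size * block_size source channels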
+ const auto dst_channels = src_shape[2] / (block_size * block_size);
+ const auto src_block_col_stride = dst_channels * src_strides[2];
+ const auto src_block_row_stride = block_size * dst_channels * src_strides[2];
+
+ auto *src_batch_ptr = src;
+ auto *dst_batch_ptr = dst;
+
+ for (uintptr_t batch = 0; batch < src_shape[3]; ++batch)
+ {
+ auto *src_channel_ptr = src_batch_ptr;
+ auto *dst_channel_ptr = dst_batch_ptr;
+
+ for (uintptr_t channel = 0; channel < dst_channels; ++channel)
+ {
+ auto *src_height_block_ptr = src_channel_ptr;
+ auto *dst_row_ptr = dst_channel_ptr;
+
+ for (uintptr_t height_block = 0; height_block < src_shape[1]; ++height_block)
+ {
+ auto *src_block_row_ptr = src_height_block_ptr;
+
+ for (uintptr_t block_row = 0; block_row < block_size; ++block_row)
+ {
+ auto *src_width_block_ptr = src_block_row_ptr;
+ auto *dst_col_ptr = dst_row_ptr;
+
+ for (uintptr_t width_block = 0; width_block < src_shape[0]; ++width_block)
+ {
+ auto *src_block_col_ptr = src_width_block_ptr;
+
+ for (uintptr_t block_col = 0; block_col < block_size; ++block_col)
+ {
+ // The source pointer is accumulated as:
+ //
+ // src_block_col_ptr =
+ // src +
+ // batch * src_strides[3] +
+ // (channel + (block_row * block_size + block_col) * dst_channels) * src_strides[2] +
+ // height_block * src_strides[1] +
+ // width_block * element_size;
+ //
+ // The destination pointer is accumulated as:
+ //
+ // dst_col_ptr =
+ // dst +
+ // batch * dst_strides[3] +
+ // channel * dst_strides[2] +
+ // (height_block * block_size + block_row) * dst_strides[1] +
+ // (width_block * block_size + block_col) * element_size
+
+ std::memcpy(dst_col_ptr, src_block_col_ptr, element_size);
+
+ src_block_col_ptr += src_block_col_stride;
+ dst_col_ptr += element_size;
+ }
+
+ src_width_block_ptr += element_size;
+ }
+
+ src_block_row_ptr += src_block_row_stride;
+ dst_row_ptr += dst_strides[1];
+ }
+
+ src_height_block_ptr += src_strides[1];
+ }
+
+ src_channel_ptr += src_strides[2];
+ dst_channel_ptr += dst_strides[2];
+ }
+
+ src_batch_ptr += src_strides[3];
+ dst_batch_ptr += dst_strides[3];
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
new file mode 100644
index 0000000000..b1c84599dc
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+
+#include <cstdint>
+#include <cstring>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+void depth_to_space_nhwc_any( //
+ const uint8_t *src,
+ uint8_t *dst,
+ const uintptr_t src_shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4],
+ uintptr_t element_size,
+ uintptr_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
+ ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
+
+ const auto src_block_row_stride = (src_shape[0] / block_size) * element_size;
+ const auto dst_width_block_stride = block_size * dst_strides[1];
+
+ auto *src_batch_ptr = src;
+ auto *dst_batch_ptr = dst;
+
+ for (uintptr_t batch = 0; batch < src_shape[3]; ++batch)
+ {
+ auto *src_height_block_ptr = src_batch_ptr;
+ auto *dst_row_ptr = dst_batch_ptr;
+
+ for (uintptr_t height_block = 0; height_block < src_shape[2]; ++height_block)
+ {
+ auto *src_block_row_ptr = src_height_block_ptr;
+
+ for (uintptr_t block_row = 0; block_row < block_size; ++block_row)
+ {
+ auto *src_width_block_ptr = src_block_row_ptr;
+ auto *dst_width_block_ptr = dst_row_ptr;
+
+ for (uintptr_t width_block = 0; width_block < src_shape[1]; ++width_block)
+ {
+ // The source pointer is accumulated as:
+ //
+ // src_width_block_ptr =
+ // src +
+ // batch * src_strides[3] +
+ // height_block * src_strides[2] +
+ // width_block * src_strides[1] +
+ // block_row * (src_shape[0] / block_size) * element_size;
+ //
+ // The destination pointer is accumulated as:
+ //
+ // dst_width_block_ptr =
+ // dst +
+ // batch * dst_strides[3] +
+ // (height_block * block_size + block_row) * dst_strides[2] +
+ // width_block * block_size * dst_strides[1];
+
+ std::memcpy(dst_width_block_ptr, src_width_block_ptr, src_block_row_stride);
+
+ src_width_block_ptr += src_strides[1];
+ dst_width_block_ptr += dst_width_block_stride;
+ }
+
+ src_block_row_ptr += src_block_row_stride;
+ dst_row_ptr += dst_strides[2];
+ }
+
+ src_height_block_ptr += src_strides[2];
+ }
+
+ src_batch_ptr += src_strides[3];
+ dst_batch_ptr += dst_strides[3];
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
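
For a fixed (batch, height_block, block_row, width_block), the copied span of src_shape[0] / block_size values is contiguous in both tensors, which is why the inner body is a single memcpy: it lands as block_size output pixels of dst_channels values each. A dense-tensor sketch of the same row-level copy (illustrative only, float data and contiguous NHWC storage assumed):

    #include <cstdint>
    #include <cstring>

    // Each memcpy moves bs * dst_channels contiguous values: source channels
    // [br*bs*C_out, (br+1)*bs*C_out) of pixel (h, w) become output pixels
    // (h*bs + br, w*bs .. w*bs + bs - 1) with dst_channels values each.
    void depth_to_space_nhwc_dense(const float *src, float *dst,
                                   uintptr_t batches, uintptr_t dst_channels,
                                   uintptr_t src_height, uintptr_t src_width, uintptr_t bs)
    {
        const uintptr_t src_channels = dst_channels * bs * bs;
        const uintptr_t span         = bs * dst_channels; // elements per memcpy

        for (uintptr_t n = 0; n < batches; ++n)
            for (uintptr_t h = 0; h < src_height; ++h)
                for (uintptr_t br = 0; br < bs; ++br)
                    for (uintptr_t w = 0; w < src_width; ++w)
                    {
                        const float *s = src + ((n * src_height + h) * src_width + w) * src_channels + br * span;
                        float       *d = dst + ((n * src_height * bs + (h * bs + br)) * (src_width * bs) + w * bs) * dst_channels;
                        std::memcpy(d, s, span * sizeof(float));
                    }
    }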
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..293e606d81
--- /dev/null
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ return run_depthwise_float<float16_t, float16_t>(src, weights, bias, dst, window, has_biases, info);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..c6fa4790b7
--- /dev/null
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ return run_depthwise_float<float, float>(src, weights, bias, dst, window, has_biases, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
new file mode 100644
index 0000000000..d08e973968
--- /dev/null
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
+{
+ return vqrdmulhq_n_s32(a, b);
+}
+
+inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
+{
+ return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
+{
+ const int32x4_t shift = vdupq_n_s32(-exponent);
+ const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+ const int32x4_t fixed = vqaddq_s32(x, fixup);
+ return vrshlq_s32(fixed, shift);
+}
+
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
+{
+ const int32x2_t shift = vdup_n_s32(-exponent);
+ const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+ const int32x2_t fixed = vqadd_s32(x, fixup);
+ return vrshl_s32(fixed, shift);
+}
+
+inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
+{
+ const int32x2_t xs = vdup_n_s32(x);
+ return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
+
+namespace
+{
+template <typename T, typename TW>
+void depthwise_loop_multiplier1_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
+{
+ ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
+ constexpr auto element_per_vector = vector_size / sizeof(T);
+ using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
+ using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
+ using AccType = int32_t;
+ using AccArrayType = std::array<AccType, element_per_vector>;
+
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+ const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
+
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
+
+ const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
+ const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
+ const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
+ const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
+
+ Window execution_window = window;
+ execution_window.set(Window::DimX, dim_single_unit_step);
+
+ Window win_input = window;
+ win_input.set(Window::DimX, dim_manual_loop);
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
+
+ Window win_weights = win_input;
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set(Window::DimX, dim_manual_loop);
+
+ Iterator input_it(src, win_input);
+ Iterator weights_it(weights, win_weights);
+ Iterator output_it(dst, win_output);
+ Iterator biases_it{};
+
+ if (has_biases)
+ {
+ biases_it = Iterator(biases, win_weights);
+ }
+
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ auto const base_weights_ptr = weights_it.ptr();
+ size_t x = run_info.x_start;
+
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
+ {
+ AccArrayType acc{};
+ AccArrayType in_sum{};
+ AccArrayType we_sum{};
+
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
+
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : out_of_bound_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) += input_vals[i] * weights_vals[i];
+ in_sum.at(i) += input_vals[i];
+ we_sum.at(i) += weights_vals[i];
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) -= in_sum.at(i) * weights_qoffset;
+ acc.at(i) -= we_sum.at(i) * input_qoffset;
+ acc.at(i) += k_offset;
+
+ if (has_biases)
+ {
+ acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+ }
+
+ const int32_t out_mul = output_multiplier.at(x + i);
+ const int32_t out_shift = output_shift.at(x + i);
+ if (out_shift < 0)
+ {
+ acc.at(i) =
+ saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(i) =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) +
+ output_qoffset;
+ }
+ out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
+ }
+
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
+ }
+
+ // left-over
+ for (; x < run_info.x_end; ++x)
+ {
+ AccType acc = 0;
+ AccType in_sum = 0;
+ AccType we_sum = 0;
+
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
+
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : out_of_bound_value;
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc += input_val * weights_val;
+ in_sum += input_val;
+ we_sum += weights_val;
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ T out_vals{0};
+
+ acc -= in_sum * weights_qoffset;
+ acc -= we_sum * input_qoffset;
+ acc += k_offset;
+
+ if (has_biases)
+ {
+ acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
+ }
+
+ const int32_t out_mul = output_multiplier.at(x);
+ const int32_t out_shift = output_shift.at(x);
+
+ if (out_shift < 0)
+ {
+ acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
+ }
+
+ out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
+ }
+ },
+ input_it, weights_it, biases_it, output_it);
+}
+
+template <typename T, typename TW>
+void depthwise_loop_generic_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
+{
+ using AccType = int32_t;
+
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+
+ const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
+ const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
+ const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
+ const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
+
+ Window execution_window = window;
+ execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
+
+ Window win_input = execution_window;
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
+
+ Window win_weights = window;
+ win_weights.set_dimension_step(Window::DimX, run_info.x_step);
+ win_weights.set(Window::DimY, dim_manual_loop);
+ win_weights.set(Window::DimZ, dim_manual_loop);
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set_dimension_step(Window::DimX, run_info.x_step);
+
+ Iterator input_it(src, win_input);
+ Iterator weights_it(weights, win_weights);
+ Iterator output_it(dst, win_output);
+ Iterator biases_it{};
+
+ if (has_biases)
+ {
+ biases_it = Iterator(biases, win_weights);
+ }
+
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<AccType> acc(depth_multiplier, 0);
+ std::vector<AccType> we_sum(depth_multiplier, 0);
+ AccType in_sum = 0;
+
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : out_of_bound_value;
+
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) += input_val * weights_val;
+
+ we_sum.at(m) += weights_val;
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
+ in_sum += input_val;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ acc.at(m) -= in_sum * weights_qoffset;
+ acc.at(m) -= we_sum.at(m) * input_qoffset;
+ acc.at(m) += k_offset;
+
+ if (has_biases)
+ {
+ acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ }
+
+ const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
+ const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
+ if (out_shift < 0)
+ {
+ acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) +
+ output_qoffset;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) =
+ static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
+ }
+ },
+ input_it, weights_it, biases_it, output_it);
+}
+
+template <typename T, typename TW>
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
+{
+ constexpr int half_vec = vector_size / 2;
+
+ using AccType = int32_t;
+ using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
+ using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
+ using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
+
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+
+ const auto input_qoffset_vec = wrapper::vreinterpret(
+ wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
+ const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(
+ wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
+ const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset,
+ arm_compute::wrapper::traits::vector_128_tag{});
+
+ const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
+ const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
+ const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
+
+ const auto out_mul = output_multiplier.at(0);
+ const auto out_shift = output_shift.at(0);
+
+ Window execution_window = window;
+ execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
+
+ Window win_input = execution_window;
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
+
+ Window win_weights = window;
+ win_weights.set_dimension_step(Window::DimX, run_info.x_step);
+ win_weights.set(Window::DimY, dim_manual_loop);
+ win_weights.set(Window::DimZ, dim_manual_loop);
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set_dimension_step(Window::DimX, run_info.x_step);
+
+ Iterator input_it(src, win_input);
+ Iterator weights_it(weights, win_weights);
+ Iterator output_it(dst, win_output);
+ Iterator biases_it{};
+
+ if (has_biases)
+ {
+ biases_it = Iterator(biases, win_weights);
+ }
+
+ std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
+ std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
+
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::fill(begin(acc0), end(acc0), zero);
+ std::fill(begin(acc1), end(acc1), zero);
+
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ const int32_t current_h = input_z + h * dilation.y();
+ if (current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
+ {
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const int32_t current_w = input_y + w * dilation.x();
+ if (current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
+ {
+ const auto input_8x8 = wrapper::vdup_n(
+ *(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))),
+ TagType{});
+ const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
+ const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
+
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ {
+ const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(
+ weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
+ const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
+
+ acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs),
+ wrapper::vgetlow(weights_no_offs));
+ acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs),
+ wrapper::vgethigh(weights_no_offs));
+ }
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ {
+ if (has_biases)
+ {
+ const auto bias_val0 =
+ wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ const auto bias_val1 = wrapper::vloadq(
+ reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
+
+ acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
+ acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
+ }
+
+ if (out_shift < 0)
+ {
+ acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ }
+ else
+ {
+ acc0.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ }
+
+ acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
+ acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
+
+ const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i)));
+
+ if (std::is_same<T, uint8_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)),
+ wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)),
+ wrapper::vqmovn(out_val));
+ }
+ }
+ },
+ input_it, weights_it, biases_it, output_it);
+}
+} // namespace
+
+template <typename T, typename TW>
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ PadStrideInfo conv_info = info.pad_stride_info;
+ unsigned int depth_multiplier = info.depth_multiplier;
+ Size2D dilation = info.dilation;
+ std::vector<int> output_multiplier;
+ std::vector<int> output_shift;
+
+ const auto input_scale = src->info()->quantization_info().uniform().scale;
+ const auto output_scale = dst->info()->quantization_info().uniform().scale;
+ auto weights_scale = weights->info()->quantization_info().scale();
+
+ if (!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ {
+ for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
+ {
+ weights_scale.push_back(weights_scale.front());
+ }
+ }
+
+ for (const auto &s : weights_scale)
+ {
+ int32_t out_mult = 0;
+ int32_t out_shift = 0;
+ const float multiplier = input_scale * s / output_scale;
+ arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
+
+ output_multiplier.push_back(out_mult);
+ output_shift.push_back(out_shift);
+ }
+
+ if (depth_multiplier == 1)
+ {
+ depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier,
+ output_shift, window, has_biases);
+ }
+ else
+ {
+ const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
+ const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
+
+ if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
+ {
+ depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation,
+ depth_multiplier, output_multiplier, output_shift, window,
+ has_biases);
+ }
+ else
+ {
+ depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier,
+ output_multiplier, output_shift, window, has_biases);
+ }
+ }
+}
+template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+} // namespace cpu
+} // namespace arm_compute
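
Two pieces of arithmetic in this file are worth spelling out. First, the offset handling: the inner loops accumulate raw quantized products plus running sums, and the correction acc - in_sum * weights_qoffset - we_sum * input_qoffset + k_offset (with k_offset = kernel_w * kernel_h * input_qoffset * weights_qoffset) is algebraically identical to accumulating (a_q - input_qoffset) * (b_q - weights_qoffset) directly, but keeps the hot loop free of per-tap subtractions. Second, the requantization: the accumulator is scaled by output_multiplier * 2^-31 * 2^-output_shift using the saturating-doubling-high-multiply / rounding-shift pair at the top of the file (negative shifts are folded in as a power-of-two pre-multiplication). A scalar, NEON-free sketch of that rescaling for the non-negative-shift case (illustrative only, mirroring the gemmlowp-style helpers above):

    #include <cstdint>
    #include <limits>

    // Approximates acc * real_scale, where real_scale ~= multiplier * 2^-31 * 2^-shift and shift >= 0.
    int32_t requantize_down(int32_t acc, int32_t multiplier, int shift)
    {
        // Saturating rounding doubling high multiply: high 32 bits of 2 * acc * multiplier, rounded.
        int32_t high;
        if (acc == std::numeric_limits<int32_t>::min() && multiplier == std::numeric_limits<int32_t>::min())
        {
            high = std::numeric_limits<int32_t>::max(); // the only overflowing case saturates
        }
        else
        {
            const int64_t prod  = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
            const int64_t nudge = prod >= 0 ? (INT64_C(1) << 30) : (1 - (INT64_C(1) << 30));
            high                = static_cast<int32_t>((prod + nudge) / (INT64_C(1) << 31));
        }

        if (shift == 0)
        {
            return high;
        }

        // Rounding divide by 2^shift, rounding halves away from zero.
        const int32_t mask      = (1 << shift) - 1;
        const int32_t remainder = high & mask;
        const int32_t threshold = (mask >> 1) + (high < 0 ? 1 : 0);
        return (high >> shift) + (remainder > threshold ? 1 : 0);
    }

The kernels then add the output zero point and clamp to the 8-bit output range, i.e. clamp(requantize_down(acc, out_mul, out_shift) + output_qoffset).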
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
new file mode 100644
index 0000000000..3fa5c58c3c
--- /dev/null
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
+#define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+struct ConvolutionInfo;
+
+namespace cpu
+{
+constexpr auto data_layout = DataLayout::NHWC;
+const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
+constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
+constexpr size_t vector_size = 8;
+
+struct DepthwiseConvolutionRunInfo
+{
+ const size_t num_read_elements_per_iteration;
+ const uint32_t x_start;
+ const uint32_t x_end;
+ const uint32_t x_step;
+ const uint32_t x_leftover_start;
+ const size_t input_stride_y;
+ const size_t input_stride_z;
+ const size_t input_max_offset;
+ const size_t weights_width;
+ const size_t weights_height;
+ const size_t weights_stride_y;
+ const size_t weights_stride_z;
+ const size_t conv_stride_x;
+ const size_t conv_stride_y;
+ const size_t conv_pad_left;
+ const size_t conv_pad_top;
+ const size_t input_height;
+ const size_t input_width;
+ const size_t input_depth;
+
+ DepthwiseConvolutionRunInfo(const ITensorInfo &input,
+ const ITensorInfo &weights,
+ const PadStrideInfo &conv_info,
+ const Window &w,
+ uint32_t depth_multiplier = 1) // NOLINT
+ : num_read_elements_per_iteration(
+ (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
+ x_start(w.x().start()),
+ x_end(w.x().end()),
+ x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
+ x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
+ input_stride_y(input.strides_in_bytes().y()),
+ input_stride_z(input.strides_in_bytes().z()),
+ input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) -
+ (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
+ weights_width(weights.dimension(width_idx)),
+ weights_height(weights.dimension(height_idx)),
+ weights_stride_y(weights.strides_in_bytes().y()),
+ weights_stride_z(weights.strides_in_bytes().z()),
+ conv_stride_x(conv_info.stride().first),
+ conv_stride_y(conv_info.stride().second),
+ conv_pad_left(conv_info.pad_left()),
+ conv_pad_top(conv_info.pad_top()),
+ input_height(input.dimension(height_idx)),
+ input_width(input.dimension(width_idx)),
+ input_depth(input.dimension(channel_idx))
+ {
+ }
+};
+
+inline bool is_valid_input_region(int32_t base_w,
+ uint32_t base_h,
+ uint32_t w,
+ uint32_t h,
+ const DepthwiseConvolutionRunInfo &run_info,
+ const Size2D &dilation)
+{
+ const int32_t current_h = base_h + h * dilation.y();
+ const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
+
+ const int32_t current_w = base_w + w * dilation.x();
+ const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
+
+ return is_valid_h && is_valid_w;
+}
+
+template <typename T>
+void depthwise_loop_multiplier1_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const Window &window,
+ bool has_biases)
+{
+ constexpr auto element_per_vector = vector_size / sizeof(T);
+ using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
+ using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
+
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
+
+ const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
+
+ Window execution_window = window;
+ execution_window.set(Window::DimX, dim_single_unit_step);
+
+ Window win_input = window;
+ win_input.set(Window::DimX, dim_manual_loop);
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
+
+ Window win_weights = win_input;
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set(Window::DimX, dim_manual_loop);
+
+ Iterator input_it(src, win_input);
+ Iterator weights_it(weights, win_weights);
+ Iterator output_it(dst, win_output);
+ Iterator biases_it{};
+
+ if (has_biases)
+ {
+ biases_it = Iterator(biases, win_weights);
+ }
+
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+ auto const base_weights_ptr = weights_it.ptr();
+ uint32_t x = run_info.x_start;
+
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
+ {
+ VectorType acc = zero_vector;
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
+
+ for (uint32_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for (uint32_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : zero_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ acc = wrapper::vmla(acc, weights_vals, input_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ if (has_biases)
+ {
+ const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc = wrapper::vadd(acc, biases_vals);
+ }
+
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
+ }
+
+ for (; x < run_info.x_end; ++x)
+ {
+ auto acc_scalar = T{0};
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
+
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : 0;
+ const auto weights_vals =
+ *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc_scalar += (input_vals * weights_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ if (has_biases)
+ {
+ const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc_scalar += biases_vals;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
+ }
+ },
+ input_it, weights_it, biases_it, output_it);
+}
+
+template <typename T>
+void depthwise_loop_generic_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ const Window &window,
+ bool has_biases)
+{
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+
+ Window execution_window = window;
+ execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
+
+ Window win_input = execution_window;
+ win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
+
+ Window win_weights = window;
+ win_weights.set_dimension_step(Window::DimX, run_info.x_step);
+ win_weights.set(Window::DimY, dim_manual_loop);
+ win_weights.set(Window::DimZ, dim_manual_loop);
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set_dimension_step(Window::DimX, run_info.x_step);
+
+ Iterator input_it(src, win_input);
+ Iterator weights_it(weights, win_weights);
+ Iterator output_it(dst, win_output);
+ Iterator biases_it{};
+
+ if (has_biases)
+ {
+ biases_it = Iterator(biases, win_weights);
+ }
+
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<T> acc(depth_multiplier, static_cast<T>(0));
+
+ const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : T(0);
+
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ if (has_biases)
+ {
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+ }
+ }
+ else
+ {
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+ }
+ }
+ },
+ input_it, weights_it, biases_it, output_it);
+}
+
+template <typename T, typename TW>
+void run_depthwise_float(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ PadStrideInfo conv_info = info.pad_stride_info;
+ unsigned int depth_multiplier = info.depth_multiplier;
+ Size2D dilation = info.dilation;
+
+ if (depth_multiplier == 1)
+ {
+ depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
+ }
+ else
+ {
+ depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window,
+ has_biases);
+ }
+}
+
+template <typename T, typename TW>
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
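
The float paths collected in this header follow one indexing contract: with a depth multiplier M, input channel c_in produces output channels c_in * M + m for m in [0, M), read against weight channel c_in * M + m, and taps rejected by is_valid_input_region() (i.e. falling into padding) contribute zero. A naive scalar sketch of that contract for NHWC data, assuming the same [channel, kernel_x, kernel_y] weight ordering the kernels read (reference only, not the dispatch path used by the library):

    #include <cstddef>
    #include <vector>

    // Naive NHWC depthwise convolution (float): output channel co = ci * M + m comes from
    // input channel ci and weight channel co; out-of-bounds taps are skipped (zero padding).
    void depthwise_reference_nhwc(const float *src, const float *weights, const float *bias, float *dst,
                                  std::size_t batches, std::size_t in_h, std::size_t in_w, std::size_t in_c,
                                  std::size_t k_h, std::size_t k_w, std::size_t depth_multiplier,
                                  std::size_t stride_y, std::size_t stride_x, std::size_t pad_top, std::size_t pad_left,
                                  std::size_t dil_y, std::size_t dil_x, std::size_t out_h, std::size_t out_w, bool has_bias)
    {
        const std::size_t out_c = in_c * depth_multiplier;
        for (std::size_t n = 0; n < batches; ++n)
            for (std::size_t oy = 0; oy < out_h; ++oy)
                for (std::size_t ox = 0; ox < out_w; ++ox)
                    for (std::size_t ci = 0; ci < in_c; ++ci)
                        for (std::size_t m = 0; m < depth_multiplier; ++m)
                        {
                            const std::size_t co  = ci * depth_multiplier + m;
                            float             acc = has_bias ? bias[co] : 0.f;
                            for (std::size_t ky = 0; ky < k_h; ++ky)
                                for (std::size_t kx = 0; kx < k_w; ++kx)
                                {
                                    const std::ptrdiff_t iy = static_cast<std::ptrdiff_t>(oy * stride_y + ky * dil_y) -
                                                              static_cast<std::ptrdiff_t>(pad_top);
                                    const std::ptrdiff_t ix = static_cast<std::ptrdiff_t>(ox * stride_x + kx * dil_x) -
                                                              static_cast<std::ptrdiff_t>(pad_left);
                                    if (iy < 0 || ix < 0 || iy >= static_cast<std::ptrdiff_t>(in_h) ||
                                        ix >= static_cast<std::ptrdiff_t>(in_w))
                                        continue; // padding region, cf. is_valid_input_region()
                                    acc += src[((n * in_h + iy) * in_w + ix) * in_c + ci] *
                                           weights[(ky * k_w + kx) * out_c + co];
                                }
                            dst[((n * out_h + oy) * out_w + ox) * out_c + co] = acc;
                        }
    }

depthwise_loop_multiplier1_fp() is the M == 1 specialization of this loop, vectorized across channels; depthwise_loop_generic_fp() covers arbitrary M one input channel at a time.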
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..d32847c1e8
--- /dev/null
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ return run_depthwise_quanitized8bit<uint8_t, uint8_t>(src, weights, bias, dst, window, has_biases, info);
+}
+
+void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ return run_depthwise_quanitized8bit<uint8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..682fad0bda
--- /dev/null
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qs8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
+}
+
+void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
+{
+ return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h
new file mode 100644
index 0000000000..cf80608f4f
--- /dev/null
+++ b/src/cpu/kernels/depthwiseconv2d/list.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_DEPTHWISECONV2D_LIST_H
+#define SRC_CORE_NEON_KERNELS_DEPTHWISECONV2D_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \
+ void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \
+ const Window &window, bool has_biases, const ConvolutionInfo &info)
+DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative);
+DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative);
+DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative);
+DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp32_deptwiseconv2dnative);
+DECLARE_DEPTHWISECONV2D_KERNEL(neon_qp8_qu8_deptwiseconv2dnative);
+DECLARE_DEPTHWISECONV2D_KERNEL(neon_qp8_qs8_deptwiseconv2dnative);
+#undef DECLARE_DEPTHWISECONV2D_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_DEPTHWISECONV2D_LIST_H
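
For reference, each DECLARE_DEPTHWISECONV2D_KERNEL(...) line above expands to a plain forward declaration with the shared signature; the first entry, for example, becomes:

    void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst,
                                       const Window &window, bool has_biases, const ConvolutionInfo &info);

All six kernels share this signature, so the caller can select among them purely by data type.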
diff --git a/src/cpu/kernels/dequantize/generic/neon/fp16.cpp b/src/cpu/kernels/dequantize/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..caffdf53e1
--- /dev/null
+++ b/src/cpu/kernels/dequantize/generic/neon/fp16.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/dequantize/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
+{
+ run_dequantization_core<float16_t>(input, output, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/dequantize/generic/neon/fp32.cpp b/src/cpu/kernels/dequantize/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..58e987b450
--- /dev/null
+++ b/src/cpu/kernels/dequantize/generic/neon/fp32.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/dequantize/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
+{
+ run_dequantization_core<float>(input, output, window);
+}
+} // namespace cpu
+} // namespace arm_compute
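
The fp16 and fp32 wrappers above, together with the impl.h that follows, apply the affine dequantization rule real = scale * (q - offset), with the offset fixed to zero for the symmetric QSYMM8/QSYMM16 variants; the header only adds NEON vectorization and per-channel scale lookup. A scalar sketch of the asymmetric case (illustrative only):

    #include <cstddef>
    #include <cstdint>

    // Scalar reference: real = scale * (quantized - offset).
    // TIn is uint8_t for QASYMM8 and int8_t for QASYMM8_SIGNED.
    template <typename TIn>
    void dequantize_reference(const TIn *q, float *out, std::size_t count, float scale, int32_t offset)
    {
        for (std::size_t i = 0; i < count; ++i)
        {
            out[i] = scale * static_cast<float>(static_cast<int32_t>(q[i]) - offset);
        }
    }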
diff --git a/src/cpu/kernels/dequantize/generic/neon/impl.h b/src/cpu/kernels/dequantize/generic/neon/impl.h
new file mode 100644
index 0000000000..7197d4dff6
--- /dev/null
+++ b/src/cpu/kernels/dequantize/generic/neon/impl.h
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/dequantize/generic/neon/list.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+template <typename T>
+inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+ ARM_COMPUTE_UNUSED(ptr, v);
+}
+
+template <>
+inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+ wrapper::vstore(ptr + 8, v.val[2]);
+ wrapper::vstore(ptr + 12, v.val[3]);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+ wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T>
+inline void store_result(T *ptr, const float32x4x2_t &v)
+{
+ ARM_COMPUTE_UNUSED(ptr, v);
+}
+
+template <>
+inline void store_result<float>(float *ptr, const float32x4x2_t &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
+{
+ wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename TOut, typename TIn>
+void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
+{
+ const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
+ const float scale = qinfo.scale;
+ const int32_t offset = qinfo.offset;
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale, offset);
+
+ store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window)
+{
+ const auto scale = input->info()->quantization_info().scale();
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Reset first dimension to handle tail calculations manually
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win);
+ Iterator out(output, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale[id.z()]);
+
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window)
+{
+ const auto scale = input->info()->quantization_info().scale();
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Reset first dimension to handle tail calculations manually
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win);
+ Iterator out(output, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
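+ // Gather the 16 per-channel scales of this x-block into four 4-lane vectors
+ // (in NHWC the channels run along x, so each lane needs its own scale)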
+ const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4],
+ scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9],
+ scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13],
+ scale[x + 14], scale[x + 15]}};
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, vscale);
+
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
+{
+ const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
+ const float scale = qinfo.scale;
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale);
+
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
+{
+ const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
+ const float scale = qinfo.scale;
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize_int16(vin, scale);
+
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int16_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
+{
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ run_dequantization_qasymm8<T, uint8_t>(input, output, window);
+ break;
+ case DataType::QASYMM8_SIGNED:
+ run_dequantization_qasymm8<T, int8_t>(input, output, window);
+ break;
+ case DataType::QSYMM8_PER_CHANNEL:
+ input->info()->data_layout() == DataLayout::NHWC
+ ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window)
+ : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
+ break;
+ case DataType::QSYMM8:
+ run_dequantization_qsymm8<T>(input, output, window);
+ break;
+ case DataType::QSYMM16:
+ run_dequantization_qsymm16<T>(input, output, window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/dequantize/generic/neon/list.h b/src/cpu/kernels/dequantize/generic/neon/list.h
new file mode 100644
index 0000000000..678eb2c01a
--- /dev/null
+++ b/src/cpu/kernels/dequantize/generic/neon/list.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#define DECLARE_DEQUANTIZE_KERNEL(func_name) void func_name(const ITensor *input, ITensor *output, const Window &window)
+
+DECLARE_DEQUANTIZE_KERNEL(fp32_run_dequantization_core);
+DECLARE_DEQUANTIZE_KERNEL(fp16_run_dequantization_core);
+
+#undef DECLARE_DEQUANTIZE_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H
diff --git a/src/cpu/kernels/directconv2d/impl.h b/src/cpu/kernels/directconv2d/impl.h
new file mode 100644
index 0000000000..d3965326cd
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/impl.h
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H
+#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+template <typename T, bool has_pads>
+void linearize_volume_nchw(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int top_left_x,
+ int top_left_y,
+ int kernel_width,
+ int kernel_height,
+ int kernel_depth,
+ int input_w,
+ int input_h,
+ int input_stride_x,
+ int input_stride_y,
+ int input_stride_z,
+ int pad_value,
+ int dilation_x,
+ int dilation_y)
+{
+ const int kernel_size2 = kernel_width * kernel_height;
+ const int x_e = top_left_x + kernel_width * dilation_x;
+ const int y_e = top_left_y + kernel_height * dilation_y;
+
+ // Linearize volume
+ int d = 0;
+ // This loop linearizes the volume three slices at a time. This allows us:
+ // 1) to reduce the number of iterations of the outer loop over "d"
+ // 2) to have an optimized im2col for the first convolution layer, which typically has 3 IFMs
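+ // For example, with a 3x3 kernel (kernel_size2 == 9) the three slices are written at
+ // out_ptr + 0 * 9, out_ptr + 1 * 9 and out_ptr + 2 * 9, so a single pass over (y, x)
+ // fills the linearized rows of three input channels at once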
+ for (; d <= (kernel_depth - 3); d += 3)
+ {
+ for (int y = top_left_y; y < y_e; y += dilation_y)
+ {
+ if ((y < 0 || y >= input_h) && has_pads)
+ {
+ // All the values are set to the pad value (the quantization offset, or zero for non-quantized types)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ {
+ *(out_ptr + 0 * kernel_size2) = pad_value;
+ *(out_ptr + 1 * kernel_size2) = pad_value;
+ *(out_ptr + 2 * kernel_size2) = pad_value;
+ }
+ }
+ else
+ {
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ {
+ if ((x < 0 || x >= input_w) && has_pads)
+ {
+ *(out_ptr + 0 * kernel_size2) = pad_value;
+ *(out_ptr + 1 * kernel_size2) = pad_value;
+ *(out_ptr + 2 * kernel_size2) = pad_value;
+ }
+ else
+ {
+ *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
+ out_ptr += 2 * kernel_size2;
+ }
+
+ // Left over
+ for (; d < kernel_depth; d++)
+ {
+ for (int y = top_left_y; y < y_e; y += dilation_y)
+ {
+ if ((y < 0 || y >= input_h) && has_pads)
+ {
+ // All the values are set to the pad value (the quantization offset, or zero for non-quantized types)
+ memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
+ out_ptr += kernel_width;
+ }
+ else
+ {
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ {
+ if ((x < 0 || x >= input_w) && has_pads)
+ {
+ *out_ptr = pad_value;
+ }
+ else
+ {
+ *out_ptr = *(reinterpret_cast<const T *>(
+ in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
+ }
+
+ // Append 1 if the convolution layer has biases
+ if (has_bias)
+ {
+ *out_ptr = static_cast<T>(1);
+ }
+}
+
+template <typename T, bool has_pads>
+void linearize_volume_nhwc(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int start_x,
+ int start_y,
+ int kernel_width,
+ int kernel_height,
+ int input_w,
+ int input_h,
+ int input_c,
+ int input_stride_y,
+ int input_stride_z,
+ int pad_value,
+ int dilation_x,
+ int dilation_y)
+{
+ const int end_x = start_x + kernel_width * dilation_x;
+ const int end_y = start_y + kernel_height * dilation_y;
+ const int pad_quant = kernel_width * input_c;
+ const int element_size = static_cast<int>(sizeof(T));
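+ // Fast path: the whole receptive field lies inside the input, there is no x dilation and
+ // the channels are contiguous in memory, so every kernel row can be copied with one memcpy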
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == input_c * element_size))
+ {
+ for (int y = start_y; y < end_y; y += dilation_y)
+ {
+ // Optimized path: no dilation and no boundary pixels
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
+ out_ptr += input_c * kernel_width;
+ }
+ }
+ else
+ {
+ for (int y = start_y; y < end_y; y += dilation_y)
+ {
+ if (y < 0 || y >= input_h)
+ {
+ memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
+ out_ptr += pad_quant;
+ }
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
+ {
+ for (int x = start_x; x < end_x; x += dilation_x)
+ {
+ if (x < 0 || x >= input_w)
+ {
+ memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
+ out_ptr += input_c;
+ }
+ else
+ {
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ input_c * element_size);
+ out_ptr += input_c;
+ }
+ }
+ }
+ else
+ {
+ // Optimized path: no dilation and no boundary pixels
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
+ out_ptr += input_c * kernel_width;
+ }
+ }
+ }
+ // Append 1 if the convolution layer has biases
+ if (has_bias)
+ {
+ *out_ptr = static_cast<T>(1);
+ }
+}
+
+template <typename T, bool has_pads>
+void linearize_volume_nhwc(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int start_x,
+ int start_y,
+ int kernel_width,
+ int kernel_height,
+ int input_w,
+ int input_h,
+ int input_c,
+ int input_stride_y,
+ int input_stride_z,
+ int pad_value,
+ int dilation_x,
+ int dilation_y,
+ int pad_right)
+{
+ const int end_x = start_x + kernel_width * dilation_x;
+ const int end_y = start_y + kernel_height * dilation_y;
+ const int pad_quant = kernel_width * (input_c + pad_right);
+ const int element_size = static_cast<int>(sizeof(T));
+ const int channel_chunk_size = input_c * element_size;
+
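+ // Same fast path as the overload above, but the destination keeps pad_right extra elements
+ // after every channel chunk, so each kernel row is copied chunk by chunk instead of in one go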
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == channel_chunk_size))
+ {
+ for (int y = start_y; y < end_y; y += dilation_y)
+ {
+ const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
+ for (int e = 0; e < kernel_width; e++)
+ {
+ memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
+ out_ptr += input_c + pad_right;
+ }
+ }
+ }
+ else
+ {
+ for (int y = start_y; y < end_y; y += dilation_y)
+ {
+ if (y < 0 || y >= input_h)
+ {
+ memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
+ out_ptr += pad_quant;
+ }
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
+ {
+ for (int x = start_x; x < end_x; x += dilation_x)
+ {
+ if (x < 0 || x >= input_w)
+ {
+ memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size);
+ out_ptr += input_c + pad_right;
+ }
+ else
+ {
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ channel_chunk_size);
+ out_ptr += input_c + pad_right;
+ }
+ }
+ }
+ else
+ {
+ const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
+ for (int e = 0; e < kernel_width; e++)
+ {
+ memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size),
+ channel_chunk_size);
+ out_ptr += input_c + pad_right;
+ }
+ }
+ }
+ }
+ // Append 1 if the convolution layer has biases
+ if (has_bias)
+ {
+ *out_ptr = static_cast<T>(1);
+ }
+}
+
+template <typename T, bool has_pads, bool is_nchw>
+void run_im2col(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ const int input_w = src->info()->dimension(width_idx);
+ const int input_h = src->info()->dimension(height_idx);
+ const int input_c = src->info()->dimension(channel_idx);
+ const int input_stride_x = src->info()->strides_in_bytes().x();
+ const int input_stride_y = src->info()->strides_in_bytes().y();
+ const int input_stride_z = src->info()->strides_in_bytes().z();
+ const int pad_left = conv_info.pad_left();
+ const int pad_top = conv_info.pad_top();
+ const int stride_x = conv_info.stride().first;
+ const int stride_y = conv_info.stride().second;
+ const int pad_value =
+ is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
+
+ const auto kernel_width = kernel_dims.width;
+ const auto kernel_height = kernel_dims.height;
+
+ Window window_in_out(window);
+ // The first three dimensions of the input and output are handled manually by the inner loops
+ window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Create iterators
+ Iterator in(src, window_in_out);
+ Iterator out(dst, window_in_out);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int start_w = id[width_idx] * stride_x - pad_left;
+ const int start_h = id[height_idx] * stride_y - pad_top;
+
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ auto output_ptr =
+ reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * convolved_dims.first) *
+ dst->info()->strides_in_bytes().y());
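+ // Each row of the im2col output corresponds to one (x, y) output location, hence
+ // the row index id[width_idx] + id[height_idx] * convolved_dims.first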
+
+ // Linearize volume
+ if (is_nchw)
+ {
+ linearize_volume_nchw<T, has_pads>(
+ input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, kernel_height, input_c, input_w,
+ input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, dilation.x(), dilation.y());
+ }
+ else
+ {
+ if (input_pad_right > 0)
+ {
+ linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width,
+ kernel_height, input_w, input_h, input_c, input_stride_y,
+ input_stride_z, pad_value, dilation.x(), dilation.y(),
+ input_pad_right);
+ }
+ else
+ {
+ linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width,
+ kernel_height, input_w, input_h, input_c, input_stride_y,
+ input_stride_z, pad_value, dilation.x(), dilation.y());
+ }
+ }
+ },
+ in, out);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H
diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h
new file mode 100644
index 0000000000..e3ff46b148
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/list.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H
+#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/common/Registrars.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
+ void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \
+ const PadStrideInfo &conv_info)
+
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d);
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d);
+DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nchw_directconv2d);
+
+#define DECLARE_IM2COL_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const Window &window, DataLayout data_layout, \
+ const PadStrideInfo &conv_info, std::pair<unsigned int, unsigned int> convolved_dims, \
+ const Size2D &kernel_dims, const Size2D &dilation, uint32_t input_pad_right, bool has_bias)
+
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_nopad);
+
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_nopad);
+
+#undef DECLARE_DIRECT_CONV2D_KERNEL
+#undef DECLARE_IM2COL_KERNEL
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H
diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp
new file mode 100644
index 0000000000..84f5eeff5a
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nchw/all.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/directconv2d/impl.h"
+#include "src/cpu/kernels/directconv2d/list.h"
+#include "src/cpu/kernels/directconv2d/nchw/impl.h"
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void neon_fp32_nchw_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+ convolve_nchw<float>(window, src, weights, dst, conv_info);
+}
+
+void run_im2col_fp32_nchw_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<float, true, true>(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_fp32_nchw_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<float, false, true>(src, dst, window, data_layout, conv_info, convolved_dims,
+ kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_qasymm8_nchw_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, true>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_qasymm8_nchw_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<qasymm8_t, false, true>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+void run_im2col_bf16_nchw_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<bfloat16, true, true>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_bf16_nchw_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<bfloat16, false, true>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nchw/fp16.cpp b/src/cpu/kernels/directconv2d/nchw/fp16.cpp
new file mode 100644
index 0000000000..a9cab42f56
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nchw/fp16.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/impl.h"
+#include "src/cpu/kernels/directconv2d/nchw/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+void neon_fp16_nchw_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+ convolve_nchw<float16_t>(window, src, weights, dst, conv_info);
+}
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+void run_im2col_fp16_nchw_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col<float16_t, true, true>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+}
+
+void run_im2col_fp16_nchw_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col<float16_t, false, true>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nchw/impl.h b/src/cpu/kernels/directconv2d/nchw/impl.h
new file mode 100644
index 0000000000..6a5b175d98
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nchw/impl.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H
+#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+template <typename T>
+void convolve_nchw(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_UNUSED(conv_info);
+
+ // Declare useful types
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ // Scalar quantities
+ const int element_size = src->info()->element_size();
+ const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size;
+ const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size;
+ const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size;
+ const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+
+ const int input_dim_w = src->info()->dimension(0);
+ const int input_dim_h = src->info()->dimension(1);
+
+ const int output_stride_c = dst->info()->strides_in_bytes()[2];
+
+ const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size;
+ const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size;
+ const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size;
+
+ const int kernel_dim_w = weights->info()->dimension(0);
+ const int kernel_dim_h = weights->info()->dimension(1);
+
+ const int conv_pad_top = conv_info.pad_top();
+ const int conv_pad_left = conv_info.pad_left();
+ const int conv_stride_w = std::get<0>(conv_info.stride());
+ const int conv_stride_h = std::get<1>(conv_info.stride());
+
+ // Set up the window for the output iterator
+ Window window_out = window;
+ window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Set up the window for the weights iterator
+ Window window_w = calculate_max_window(*weights->info(), Steps());
+ window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ Iterator out(dst, window_out);
+ Iterator wei(weights, window_w);
+
+ constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ const int index_c_end = weights->info()->dimension(2);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n;
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+ T out_temp = static_cast<T>(0);
+
+ for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
+ {
+ const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
+ const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
+ {
+ const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
+ const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
+ int index_w = in_w_start;
+ int index_wei_w = wei_w_start;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
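+ // Multiply-accumulate the row in 128-bit chunks; the lanes are summed with
+ // vreduce() below and the scalar loop picks up the leftover columns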
+ for (; index_w <= ((in_w_end - num_elems_read_per_iteration));
+ index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
+ const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_w < in_w_end; ++index_w, ++index_wei_w)
+ {
+ const auto src_val = *(in_ptr_row + index_w * input_stride_w);
+ const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp += src_val * w_val;
+ }
+ }
+ }
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
+ },
+ out);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp
new file mode 100644
index 0000000000..f78601544f
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/impl.h"
+#include "src/cpu/kernels/directconv2d/nchw/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void run_im2col_fp16_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col<float16_t, true, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+}
+
+void run_im2col_fp16_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ arm_compute::cpu::kernels::run_im2col<float16_t, false, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+ ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right,
+ has_bias);
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
new file mode 100644
index 0000000000..17d9212248
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void neon_fp32_nhwc_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+ convolve_nhwc<float>(window, src, weights, dst, conv_info);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
new file mode 100644
index 0000000000..f235167e28
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <algorithm>
+
+using namespace arm_compute::detail;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
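+// The optimized NHWC path walks each WC plane as one contiguous run of elements, which is
+// only valid when neither the input nor the weights carry padding bytes along x (the channels)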
+bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
+{
+ return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 &&
+ weights->padding().right == 0);
+}
+} // namespace
+
+template <typename T>
+void convolve_nhwc(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+{
+ // Declare useful types
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ // Scalar quantities
+ const int element_size = src->info()->element_size();
+ const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
+ const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
+ const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+
+ const int output_stride_c = dst->info()->strides_in_bytes().x();
+
+ const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
+ const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
+ const int kernel_dim_w = weights->info()->dimension(1);
+ const int kernel_dim_h = weights->info()->dimension(2);
+
+ const int conv_pad_top = conv_info.pad_top();
+ const int conv_pad_left = conv_info.pad_left();
+ const int conv_stride_w = std::get<0>(conv_info.stride());
+ const int conv_stride_h = std::get<1>(conv_info.stride());
+
+ // Set up the window for the output iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Set up the window for the weights iterator
+ Window window_w = calculate_max_window(*weights->info(), Steps());
+ window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ Iterator out(dst, window_out);
+ Iterator wei(weights, window_w);
+
+ constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+ // nhwc optimized
+ if (have_zero_x_internal_padding(src->info(), weights->info()))
+ {
+ // This path assumes that the input and the weights have no padding along the channel (x) dimension
+
+ /*
+ * This implementation vectorizes over the full WC plane of the input and the
+ * weights by treating each plane as a flat series of elements. For example,
+ * with 3x3 weights, 3 input channels and 4-element floating point vector
+ * operations, the first vector load takes the 3 channel elements of the first
+ * width position plus the first channel element of the second width position.
+ * The 9 elements of each WC weight plane then require 2 four-element vector
+ * operations and one final scalar operation.
+ *
+ * This works because, when the input vector to be multiplied with the weights
+ * is built, exactly the required elements are loaded in the same order, so the
+ * multiplication pairs the correct input and weight elements.
+ */
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ /*
+ * Here we create theoretical indices which are then validated for both
+ * the input and the weights.
+ * As a reminder, this loop visits each output point in NHW; C is handled
+ * in the weights loop.
+ */
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points
+ const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
+ const int index_h_start = in_h_start - in_h_start_t;
+ const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
+ const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
+ * This is the loop over the weights, and it goes along N (the batches).
+ * As a reminder, the batches of the weights map to the channels of
+ * the output.
+ */
+ const T *in_ptr_row =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
+ const T *weights_ptr_row =
+ reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+ T out_temp = static_cast<T>(0);
+ for (int index_h = index_h_start; index_h < index_h_end;
+ ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
+ {
+ const T *in_ptr_mover = in_ptr_row;
+ int index_wc = index_wc_start;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
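+ // Multiply-accumulate the contiguous WC run in 128-bit chunks; the lanes are
+ // summed with vreduce() and the scalar loop handles the remaining elements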
+ for (; index_wc <= index_wc_end - num_elems_read_per_iteration;
+ index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_row + index_wc);
+ out_temp += src_val * w_val;
+ }
+ }
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
+ },
+ out);
+ }
+ else // nhwc non optimized
+ {
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ const int index_c_end = weights->info()->dimension(0);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n;
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+ T out_temp = static_cast<T>(0);
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
+ {
+ const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
+ {
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c = 0;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
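+ // In this path only the channel dimension is vectorized, since the WC run is
+ // not guaranteed to be contiguous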
+ for (; index_c <= index_c_end - num_elems_read_per_iteration;
+ index_c += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration,
+ weights_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ const auto w_vec = wrapper::vloadq(weights_ptr_mover);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_mover);
+ out_temp += src_val * w_val;
+ }
+ }
+ }
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
+ },
+ out);
+ }
+}
+
+template void convolve_nhwc<float>(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
new file mode 100644
index 0000000000..efb9ce8e2a
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
+#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
+
+#include "arm_compute/core/ITensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+template <typename T>
+void convolve_nhwc(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif //SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp
new file mode 100644
index 0000000000..4c6fbec63a
--- /dev/null
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d/impl.h"
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
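+// Thin non-template wrappers that instantiate run_im2col for QASYMM8; the second template
+// argument (true for the *_pad variant) selects the padded code path.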
+void run_im2col_qasymm8_pad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+
+void run_im2col_qasymm8_nopad(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ DataLayout data_layout,
+ const PadStrideInfo &conv_info,
+ std::pair<unsigned int, unsigned int> convolved_dims,
+ const Size2D &kernel_dims,
+ const Size2D &dilation,
+ uint32_t input_pad_right,
+ bool has_bias)
+{
+ arm_compute::cpu::kernels::run_im2col<qasymm8_t, false, false>(
+ src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..9b4375f17c
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window);
+}
+
+template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window);
+}
+
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..53ccd89dcc
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window);
+}
+
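+// Illustrative call of one of the instantiations below (src0/src1/dst/window are hypothetical
+// F32 tensors and an execution window):
+//   neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(src0, src1, dst, window);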
+template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window);
+}
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
new file mode 100644
index 0000000000..78e3baf74b
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -0,0 +1,1316 @@
+/*
+ * Copyright (c) 2021-2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
+
+#include "src/core/NEON/NEAsymm.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op, typename VectorType>
+typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
+{
+ using vec_type = typename VectorType::type;
+ using scalar_type = typename VectorType::scalar_type;
+ using tag_type = typename VectorType::tag_type;
+
+ vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
+
+ switch (op)
+ {
+ case ArithmeticOperation::MAX:
+ res = wrapper::vmax(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = wrapper::vmin(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ const vec_type tmp = wrapper::vsub(a, b);
+ res = wrapper::vmul(tmp, tmp);
+ break;
+ }
+ case ArithmeticOperation::PRELU:
+ {
+ const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
+ const vec_type tmp = wrapper::vmul(a, b);
+ const auto gt = wrapper::vcgt(a, zero);
+
+ res = wrapper::vbsl(gt, a, tmp);
+ break;
+ }
+
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
+{
+ using tag_type = typename VectorType::tag_type;
+ using vec_type = typename VectorType::type;
+
+ vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
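+    // 'reorder' swaps the operands so that, when the broadcast value comes from the first input,
+    // non-commutative operations (e.g. PRELU, DIV) still see their arguments in the original order.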
+ return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op(
+ const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(
+ int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+{
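+    // Generic element-wise driver: broadcasts along X when the input shapes differ, runs the
+    // supplied vectorised loop and finishes any leftover elements with the scalar functor.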
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_value, output_ptr, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+ !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+template <ArithmeticOperation op, typename ScalarType>
+inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch (op)
+ {
+ case ArithmeticOperation::MAX:
+ res = std::max(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = std::min(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ res = (a - b) * (a - b);
+ break;
+ }
+ case ArithmeticOperation::PRELU:
+ {
+ res = (a > 0 ? a : a * b);
+ break;
+ }
+ case ArithmeticOperation::DIV:
+ {
+ res = a / b;
+ break;
+ }
+ case ArithmeticOperation::POWER:
+ {
+ res = std::pow(a, b);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <>
+inline int32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a,
+ const int32x4_t &b)
+{
+ int32x4_t result;
+
+ // Neon(TM) does not have vector integer division
+ result[0] = a[0] / b[0];
+ result[1] = a[1] / b[1];
+ result[2] = a[2] / b[2];
+ result[3] = a[3] / b[3];
+
+ return result;
+}
+
+template <>
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+ const float32x4_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+
+template <>
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+ const float32x4_t &b)
+{
+ return wrapper::vpow(a, b);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(
+ const float16x8_t &a, const float16x8_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+
+template <>
+inline float16x8_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(
+ const float16x8_t &a, const float16x8_t &b)
+{
+ return wrapper::vpow(a, b);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const ScalarType *input1_ptr,
+ const ScalarType *input2_ptr,
+ ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr,
+ const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x,
+ elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op, typename VectorType>
+void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ using scalar_type = typename VectorType::scalar_type;
+
+ elementwise_op<scalar_type, scalar_type, VectorType>(
+ in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>,
+ &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
+ &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType>
+inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
+{
+ bool res = false;
+
+ switch (op)
+ {
+ case ComparisonOperation::Equal:
+ res = (a == b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = (a != b);
+ break;
+ case ComparisonOperation::Greater:
+ res = (a > b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = (a >= b);
+ break;
+ case ComparisonOperation::Less:
+ res = (a < b);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = (a <= b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
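+    // A true result is encoded as an all-ones byte (0xFF) to match the lane masks produced by the
+    // vector comparison path.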
+ return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
+}
+
+template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
+{
+ OutputVectorType res = {0, 0, 0, 0};
+
+ switch (op)
+ {
+ case ComparisonOperation::Equal:
+ res = wrapper::vceq(a, b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = wrapper::vnot(wrapper::vceq(a, b));
+ break;
+ case ComparisonOperation::Greater:
+ res = wrapper::vcgt(a, b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = wrapper::vcge(a, b);
+ break;
+ case ComparisonOperation::Less:
+ res = wrapper::vcgt(b, a);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = wrapper::vcge(b, a);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType
+elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+{
+ InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a,
+ reorder ? a : broadcast_vector);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_8_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, a);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_16_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_32_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+ const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
+ }
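+    // Handle one remaining block of four elements with a single 4-lane comparison; anything left
+    // after that is processed by the caller's scalar loop.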
+ if (x <= window_end_x - 4)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ for (int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(a, i);
+ }
+        x += 4;
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_8_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
+ wrapper::vstore(output_ptr + x, res);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_16_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_32_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto a = wrapper::vloadq(input1_ptr + x);
+ auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ a = wrapper::vloadq(input1_ptr + x + 4);
+ b = wrapper::vloadq(input2_ptr + x + 4);
+ const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
+ }
+ if (x <= window_end_x - 4)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ for (int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(res, i);
+ }
+        x += 4;
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+}
+
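+// Dequantize 16 QASYMM8 values: widen to 32 bit, subtract the offset and multiply by the scale,
+// yielding four float32x4_t.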
+inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+{
+ qasymm8x16_t x = vld1q_u8(input1_ptr);
+ const float32x4x4_t out = {{
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ }};
+ return out;
+}
+
+inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+{
+ qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
+ const float32x4x4_t out = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+ }};
+ return out;
+}
+
+inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
+{
+ const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
+ const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
+ vst1q_u8(output_ptr, vcombine_u8(pa, pb));
+}
+
+inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
+{
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
+ vst1q_u8(output_ptr, vcombine_u8(pa, pb));
+}
+
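+// Requantize four float32x4_t back to QASYMM8: q = truncate(f * (1 / scale) + offset), then
+// saturate and narrow to 8 bit.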
+inline void
+store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+{
+ int32x4x4_t out = {{
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }};
+ store_quantized(output_ptr, out);
+}
+
+inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
+{
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
+ vst1q_s8(output_ptr, vcombine_s8(pa, pb));
+}
+
+inline void store_quantized_signed(int8_t *output_ptr,
+ const float32x4x4_t &rf,
+ const float32x4_t &offset,
+ const float32x4_t &invscale)
+{
+ int32x4x4_t out = {{
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }};
+ store_quantized_signed(output_ptr, out);
+}
+
+template <ArithmeticOperation op>
+inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
+{
+ return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);
+}
+
+template <ArithmeticOperation op>
+inline int8_t
+elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
+{
+ return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
+}
+
+template <ArithmeticOperation op>
+float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
+ float32x4x4_t out = {{
+ elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
+ }};
+ return out;
+}
+
+template <ComparisonOperation op>
+inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
+{
+ ARM_COMPUTE_UNUSED(qinfo);
+ return elementwise_comp_op_scalar<op>(a, b);
+}
+
+template <ComparisonOperation op>
+inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}};
+ return out;
+}
+
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get inputs and compute output
+ const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+ const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_signed_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *input1_ptr,
+ const int8_t *input2_ptr,
+ int8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get inputs and compute output
+ const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
+ const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
+ store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf =
+ elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ int8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af =
+ load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf =
+ elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
+{
+ ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+ const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
+ store_quantized(output_ptr + x, rf);
+ }
+ return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_signed_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *input1_ptr,
+ const int8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
+{
+ ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
+ const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
+ store_quantized(output_ptr + x, rf);
+ }
+ return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
+{
+ ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const uint32x4x4_t rf =
+ elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ store_quantized(output_ptr + x, rf);
+ }
+ return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
+{
+ ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af =
+ load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const uint32x4x4_t rf =
+ elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ store_quantized(output_ptr + x, rf);
+ }
+ return x;
+}
+
+inline void elementwise_op_quantized(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const uint8_t *,
+ float32x4x4_t,
+ uint8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const uint8_t *,
+ const uint8_t *,
+ uint8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
+{
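+    // Quantized driver: inputs are dequantized to float, the operation runs in float, and the
+    // per-element loops write the result back (requantized for arithmetic ops, as a 0/255 mask
+    // for comparisons).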
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
+
+    // Output quantization info (0.5 is added so the truncating float-to-int conversion rounds to the nearest integer)
+ const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
+
+ if (is_broadcast_across_x)
+ {
+ // Select the broadcast input on the X axis
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+
+ const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
+ const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
+
+ // Input1 quantization info
+ const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
+ const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
+
+ // Input2 quantization info
+ const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
+ const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+inline void
+elementwise_comp_quantized_signed(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const int8_t *,
+ float32x4x4_t,
+ uint8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const int8_t *,
+ const int8_t *,
+ uint8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
+
+ const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
+
+ if (is_broadcast_across_x)
+ {
+ // Select the broadcast input on the X axis
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+
+ const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
+ const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
+
+ // Input1 quantization info
+ const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
+ const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
+
+ // Input2 quantization info
+ const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
+ const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+inline void
+elementwise_op_quantized_signed(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const int8_t *,
+ float32x4x4_t,
+ int8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const int8_t *,
+ const int8_t *,
+ int8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
+
+ const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
+
+ if (is_broadcast_across_x)
+ {
+ // Select the broadcast input on the X axis
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+
+ const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
+ const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
+
+ // Input1 quantization info
+ const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
+ const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
+
+ // Input2 quantization info
+ const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
+ const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+template <ArithmeticOperation op>
+void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
+ &elementwise_arithm_op_quantized_broadcast_loop<op>,
+ &elementwise_arithm_op_quantized_loop<op>);
+}
+
+template <ArithmeticOperation op>
+void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,
+ &elementwise_arithm_op_quantized_signed_broadcast_loop<op>,
+ &elementwise_arithm_op_quantized_singed_loop<op>);
+}
+
+template <ComparisonOperation op>
+void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
+ &elementwise_comp_op_quantized_broadcast_loop<op>,
+ &elementwise_comp_op_quantized_loop<op>);
+}
+
+template <ComparisonOperation op>
+void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
+ &elementwise_comp_op_quantized_signed_broadcast_loop<op>,
+ &elementwise_comp_op_quantized_signed_loop<op>);
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
new file mode 100644
index 0000000000..09ad13d5eb
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
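+// Thin wrappers binding the generic NEON element-wise implementation to concrete integer
+// vector types, followed by explicit instantiations for every supported operation.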
+template <ArithmeticOperation op>
+void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window);
+}
+
+template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ArithmeticOperation op>
+void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window);
+}
+template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window);
+}
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window);
+}
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window);
+}
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..d891f70644
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op_quantized<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
+{
+ return elementwise_comp_op_quantized<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..b1f8e018f5
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
+{
+ return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
new file mode 100644
index 0000000000..600c7f1c05
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<float16_t>(in1, in2, out, op, window);
+}
+
+template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<float16_t>(in1, in2, out, op, window);
+}
+
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
new file mode 100644
index 0000000000..832a966883
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<float32_t>(in1, in2, out, op, window);
+}
+
+template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<float>(in1, in2, out, op, window);
+}
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
new file mode 100644
index 0000000000..fa48407e9b
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+
+#include "src/core/NEON/SVEMath.h"
+
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::wrapper;
+
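+// SVE arithmetic driver: steps along X with an svwhilelt-governed predicate so the trailing
+// partial vector needs no scalar tail, and preserves the (in1, in2) operand order whichever
+// input is broadcast.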
+template <typename ScalarType>
+void elementwise_arithmetic_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+{
+ using VectorType = typename sve_vector<ScalarType>::type;
+
+ const auto all_true_pg = svptrue<ScalarType>();
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_vector = svdup_n(broadcast_value);
+
+ int x = window_start_x;
+
+ svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+ VectorType res{};
+
+ if (is_broadcast_input_2)
+ {
+ res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector,
+ broadcast_vector, op);
+ }
+ else
+ {
+ res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(
+ pg, broadcast_vector, non_broadcast_vector, op);
+ }
+ svst1(pg, output_ptr + x, res);
+
+ x += svcnt<ScalarType>();
+ pg = svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+
+ svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = svld1(pg, input1_ptr + x);
+ const auto in2 = svld1(pg, input2_ptr + x);
+ const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op);
+ svst1(pg, output_ptr + x, res);
+
+ x += svcnt<ScalarType>();
+ pg = svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
+ }
+}
+template void elementwise_arithmetic_op<float32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<float16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+
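+// SVE comparison driver: same structure as the arithmetic driver above, but the governing
+// predicate is narrowed to byte granularity before the (typically U8) results are stored.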
+template <typename InputScalarType, typename OutputScalarType>
+void elementwise_comparison_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+{
+ static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+ "input data type's width should be equal to or greater than output data type's width");
+
+ using OutputVectorType = typename sve_vector<OutputScalarType>::type;
+ const auto all_true_pg = svptrue<InputScalarType>();
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+ const auto broadcast_vector = svdup_n(broadcast_value);
+
+ int x = window_start_x;
+
+ svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+ const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+ OutputVectorType res{};
+ if (is_broadcast_input_2)
+ {
+ res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(
+ pg, non_broadcast_vector, broadcast_vector, op);
+ }
+ else
+ {
+ res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(
+ pg, broadcast_vector, non_broadcast_vector, op);
+ }
+ svst1(output_pg, output_ptr + x, res);
+
+ x += svcnt<InputScalarType>();
+ pg = svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+
+ svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = svld1(pg, input1_ptr + x);
+ const auto in2 = svld1(pg, input2_ptr + x);
+ const auto res =
+ elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op);
+ const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+ svst1(output_pg, output_ptr + x, res);
+
+ x += svcnt<InputScalarType>();
+ pg = svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
+ }
+}
+
+template void elementwise_comparison_op<float32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<float16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<uint8_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+
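+// Integer specializations: S32 POWER and DIV are computed by converting to F32, applying the
+// floating-point operation and converting back; S16 DIV is not supported.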
+template <>
+svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
+{
+ return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
+}
+
+template <>
+svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
+{
+ return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
+}
+
+template <>
+svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svint16_t &b)
+{
+ ARM_COMPUTE_UNUSED(pg, a, b);
+ ARM_COMPUTE_ERROR("Not supported");
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
new file mode 100644
index 0000000000..4c61b9f315
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
+#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/NEON/wrapper/svtraits.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::wrapper;
+
+template <typename VectorType>
+VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b)
+{
+ return svpow_z(pg, a, b);
+}
+
+template <typename VectorType>
+VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b)
+{
+ return svdiv_z(pg, a, b);
+}
+
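+// Narrow a predicate built for <bytewidth>-byte elements down to byte granularity by repeatedly
+// unzipping it against an all-false predicate (the case fall-throughs are intentional), so it can
+// govern an 8-bit store.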
+template <uint32_t bytewidth>
+svbool_t narrow_to_byte_predicate(svbool_t pg)
+{
+ const auto all_false = svpfalse();
+
+ switch (bytewidth)
+ {
+ case 8:
+ pg = svuzp1_b32(pg, all_false);
+ /* fall through */
+ case 4:
+ pg = svuzp1_b16(pg, all_false);
+ /* fall through */
+ case 2:
+ pg = svuzp1_b8(pg, all_false);
+ /* fall through */
+ default:
+ break;
+ }
+ return pg;
+}
+
+template <typename VectorType>
+VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op)
+{
+ using ScalarType = typename wrapper::sve_scalar<VectorType>::type;
+ VectorType res{};
+
+ switch (op)
+ {
+ case ArithmeticOperation::MAX:
+ res = svmax_z(pg, a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = svmin_z(pg, a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ const auto tmp = svsub_z(pg, a, b);
+ res = svmul_z(pg, tmp, tmp);
+ break;
+ }
+ case ArithmeticOperation::PRELU:
+ {
+ const auto zero = svdup_n(ScalarType(0));
+ const auto tmp = svmul_z(pg, a, b);
+ const auto gt = svcmpgt(pg, a, zero);
+ res = svsel(gt, a, tmp);
+ break;
+ }
+ case ArithmeticOperation::DIV:
+ {
+ res = elementwise_div(pg, a, b);
+ break;
+ }
+ case ArithmeticOperation::POWER:
+ {
+ res = elementwise_pow(pg, a, b);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
+
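+// Vector comparison helper: computes the selection predicate for the requested operation, narrows
+// it to byte granularity and selects between an all-ones (true) and all-zeros (false) result vector.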
+template <typename InputVectorType, typename OutputVectorType>
+OutputVectorType
+elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
+{
+ svbool_t selection_vector{};
+
+ switch (op)
+ {
+ case ComparisonOperation::Equal:
+ selection_vector = svcmpeq(pg, a, b);
+ break;
+ case ComparisonOperation::NotEqual:
+ selection_vector = svcmpne(pg, a, b);
+ break;
+ case ComparisonOperation::Greater:
+ selection_vector = svcmpgt(pg, a, b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ selection_vector = svcmpge(pg, a, b);
+ break;
+ case ComparisonOperation::Less:
+ selection_vector = svcmplt(pg, a, b);
+ break;
+ case ComparisonOperation::LessEqual:
+ selection_vector = svcmple(pg, a, b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ using InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type;
+ selection_vector = narrow_to_byte_predicate<sizeof(InputScalarType)>(selection_vector);
+
+ using OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type;
+ const auto false_vector = svdup_n(static_cast<OutputScalarType>((uint32_t)0));
+ const auto true_vector = svdup_n(static_cast<OutputScalarType>(~(uint32_t)0));
+ auto ret = svsel(selection_vector, true_vector, false_vector);
+
+ return ret;
+}
+
+template <typename ScalarType>
+void elementwise_arithmetic_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window);
+
+template <typename ScalarType, typename OutputScalarType = uint8_t>
+void elementwise_comparison_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
new file mode 100644
index 0000000000..f7714ff7e9
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<int32_t>(in1, in2, out, op, window);
+}
+template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ArithmeticOperation op>
+void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<int16_t>(in1, in2, out, op, window);
+}
+template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<uint8_t>(in1, in2, out, op, window);
+}
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<int16_t>(in1, in2, out, op, window);
+}
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<int32_t>(in1, in2, out, op, window);
+}
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
new file mode 100644
index 0000000000..7c6015d379
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
+#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
+
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::wrapper;
+
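+// Load a full vector of quantized 8-bit values (signed and unsigned overloads below), widen it to
+// four S32 vectors with svmovlb/svmovlt and dequantize each lane as (x - offset) * scale.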
+inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
+{
+ auto x = svld1(pg, ptr);
+
+ const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
+
+ pg = svptrue_b8();
+
+ return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
+}
+
+inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
+{
+ auto x = svld1(pg, ptr);
+
+ const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
+
+ pg = svptrue_b8();
+
+ return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
+}
+
+inline void
+store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+{
+ const auto quantized =
+ svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+
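+ // Saturating-narrow the four int32 results back to unsigned 8-bit: s32 -> u16 via svqxtunb/svqxtunt, then u16 -> u8 via svqxtnb/svqxtnt, interleaving even and odd lanes.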
+ const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
+ const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
+ const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
+ svst1(pg, ptr, narrowed);
+}
+
+inline void
+store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+{
+ const auto quantized =
+ svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+
+ const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
+ const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
+ const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
+
+ svst1(pg, ptr, narrowed);
+}
+
+template <typename ScalarType>
+void elementwise_arithmetic_quantized_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+{
+ const auto all_true_pg = wrapper::svptrue<ScalarType>();
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
+ const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ const auto non_broadcast_qinfo =
+ is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+ const auto broadcast_qinfo =
+ is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+
+ const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
+ const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const float broadcast_value_f =
+ Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+ const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+ svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 =
+ load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+
+ svfloat32x4_t result{};
+
+ if (!is_broadcast_input_2)
+ {
+ result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op));
+ }
+ else
+ {
+ result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+ }
+
+ store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset);
+ const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale);
+
+ const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
+ const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+ const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+
+ const auto result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+
+ store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
+ }
+}
+
+template <typename InputScalarType, typename OutputScalarType = uint8_t>
+void elementwise_comparison_quantized_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+{
+ static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+ "input data type's width should be equal to or greater than output data type's width");
+
+ using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
+ const auto all_true_pg = wrapper::svptrue<InputScalarType>();
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ const auto non_broadcast_qinfo =
+ is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+ const auto broadcast_qinfo =
+ is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+
+ const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
+ const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+ const float broadcast_value_f =
+ Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+ const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+ svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 =
+ load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+
+ svuint8x4_t result{};
+
+ if (!is_broadcast_input_2)
+ {
+ result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0),
+ svget4(in1, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1),
+ svget4(in1, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2),
+ svget4(in1, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+ pg, svget4(in2, 3), svget4(in1, 3), op));
+ }
+ else
+ {
+ result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+ svget4(in2, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+ svget4(in2, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+ svget4(in2, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+ pg, svget4(in1, 3), svget4(in2, 3), op));
+ }
+
+ const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+ const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
+ const auto zipped = svzip1(zipped_bottom, zipped_top);
+ svst1(pg, output_ptr + x, zipped);
+
+ x += wrapper::svcnt<InputScalarType>();
+ pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset);
+ const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale);
+
+ const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
+ const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+ const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+ const auto result =
+ svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+ svget4(in2, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+ svget4(in2, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+ svget4(in2, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3),
+ svget4(in2, 3), op));
+
+ const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+ const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
+ const auto zipped = svzip1(zipped_bottom, zipped_top);
+ svst1(pg, output_ptr + x, zipped);
+
+ x += wrapper::svcnt<InputScalarType>();
+ pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
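The SVE2 kernels in this header all follow the same dequantize, operate in float, requantize scheme that load_quantized and store_quantized implement four vectors at a time. As a reference for what a single lane goes through, here is a scalar sketch of the QASYMM8 ADD case under the usual asymmetric quantization model; QuantInfo and add_qasymm8_scalar are illustrative names, not library code:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QuantInfo { float scale; int32_t offset; };

    inline uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b, QuantInfo qa, QuantInfo qb, QuantInfo qo)
    {
        const float fa = (static_cast<int32_t>(a) - qa.offset) * qa.scale;  // dequantize input 1
        const float fb = (static_cast<int32_t>(b) - qb.offset) * qb.scale;  // dequantize input 2
        const float fr = fa + fb;                                           // ArithmeticOperation::ADD in float
        const int32_t q = static_cast<int32_t>(std::lround(fr / qo.scale)) + qo.offset; // round and re-offset
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));         // saturate to the 8-bit range
    }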
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
new file mode 100644
index 0000000000..5cc66642d7
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_quantized_op<uint8_t>(in1, in2, out, op, window);
+}
+
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
+{
+ return elementwise_comparison_quantized_op<uint8_t>(in1, in2, out, op, window);
+}
+
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
new file mode 100644
index 0000000000..165e0c05fa
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_quantized_op<int8_t>(in1, in2, out, op, window);
+}
+
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+template <ComparisonOperation op>
+void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
+{
+ return elementwise_comparison_quantized_op<int8_t>(in1, in2, out, op, window);
+}
+
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/list.h b/src/cpu/kernels/elementwise_binary/list.h
new file mode 100644
index 0000000000..78a098e7bb
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/list.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H
+#define SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ELEMETWISE_BINARY_KERNEL(func_name) \
+ template <ArithmeticOperation op> \
+ void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp32_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s32_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp32_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s32_elementwise_binary);
+
+#undef DECLARE_ELEMETWISE_BINARY_KERNEL
+
+#define DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(func_name) \
+ template <ComparisonOperation op> \
+ void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_u8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s32_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp32_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_u8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s32_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp32_comparison_elementwise_binary);
+#undef DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H
\ No newline at end of file
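For reference, each DECLARE_ELEMETWISE_BINARY_KERNEL and DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL invocation above expands to nothing more than a forward declaration of the corresponding function template, for example:

    // DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp32_elementwise_binary) expands to:
    template <ArithmeticOperation op>
    void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);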
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..2588db024d
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_op<__fp16>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..936a2e588a
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_op<float>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
new file mode 100644
index 0000000000..d54d3984cb
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
+#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnary::RSQRT:
+ return 1 / sqrt(a);
+ case ElementWiseUnary::EXP:
+ return std::exp(a);
+ case ElementWiseUnary::NEG:
+ return -a;
+ case ElementWiseUnary::LOG:
+ return std::log(a);
+ case ElementWiseUnary::ABS:
+ return std::abs(a);
+ case ElementWiseUnary::ROUND:
+ return support::cpp11::nearbyint(a);
+ case ElementWiseUnary::SIN:
+ return std::sin(a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <typename ScalarType, typename VectorType>
+inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnary::RSQRT:
+ return wrapper::vinvsqrt(a);
+ case ElementWiseUnary::EXP:
+ return wrapper::vexpq(a);
+ case ElementWiseUnary::NEG:
+ return wrapper::vneg(a);
+ case ElementWiseUnary::LOG:
+ return wrapper::vlog(a);
+ case ElementWiseUnary::ABS:
+ return wrapper::vabs(a);
+ case ElementWiseUnary::ROUND:
+ return wrapper::vround(a);
+ case ElementWiseUnary::SIN:
+ return wrapper::vsin(a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <typename ScalarType>
+inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ const int window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
+ }
+ },
+ input, output);
+}
+
+template <>
+inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform();
+ const auto min_clamped_value = vdupq_n_f32((-128 - qi_out.offset) * qi_out.scale);
+ const auto max_clamped_value = vdupq_n_f32((127 - qi_out.offset) * qi_out.scale);
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int8x16_t vout;
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
+ elementwise_op_imp<float>(op, vin_deq.val[0]),
+ elementwise_op_imp<float>(op, vin_deq.val[1]),
+ elementwise_op_imp<float>(op, vin_deq.val[2]),
+ elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+ }
+
+ // Re-quantize to new output space
+ vout = vquantize_signed(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ if (tmp_f <= 0.0)
+ {
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (-128 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
+ tmp = quantize_qasymm8_signed(
+ tmp_f, qi_out,
+ RoundingPolicy::
+ TO_ZERO); // Rounding policy TO_ZERO matches vquantize_signed() used above, which follows the same policy on armv7a.
+ // On aarch64 a LUT is used instead, with round-to-nearest rounding.
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+template <>
+inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform();
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+ const auto min_clamped_value = vdupq_n_f32((0 - qi_out.offset) * qi_out.scale);
+ const auto max_clamped_value = vdupq_n_f32((255 - qi_out.offset) * qi_out.scale);
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ uint8x16_t vout;
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
+ elementwise_op_imp<float>(op, vin_deq.val[0]),
+ elementwise_op_imp<float>(op, vin_deq.val[1]),
+ elementwise_op_imp<float>(op, vin_deq.val[2]),
+ elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+ }
+
+ // Re-quantize to new output space
+ vout = vquantize(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ if (tmp_f <= 0.0)
+ {
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (0 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
+ tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
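The QASYMM8 and QASYMM8_SIGNED specializations above special-case LOG and RSQRT because both are undefined for inputs less than or equal to zero: such lanes are forced to the smallest (LOG) or largest (RSQRT) representable output value before requantization. A scalar sketch of the signed LOG path, using the same TO_ZERO requantization as the scalar tail above; the function name and parameters are illustrative only:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline int8_t log_qasymm8_signed_scalar(int8_t in, float in_scale, int32_t in_offset,
                                            float out_scale, int32_t out_offset)
    {
        const float x = (static_cast<int32_t>(in) - in_offset) * in_scale;  // dequantize
        const float y = (x <= 0.f) ? (-128 - out_offset) * out_scale        // log(x) undefined: clamp to minimum
                                   : std::log(x);
        const int32_t q = static_cast<int32_t>(y / out_scale) + out_offset; // truncating requantize (TO_ZERO)
        return static_cast<int8_t>(std::min(127, std::max(-128, q)));
    }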
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
new file mode 100644
index 0000000000..d4daad4ca6
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_op<int32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
new file mode 100644
index 0000000000..38cb61d0ff
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#ifdef __aarch64__
+
+void neon_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ auto win = window;
+ const auto window_end_x = window.x().end();
+ win.set(0, Window::Dimension(0, 1, 1));
+
+ Iterator src_it(in, win);
+ Iterator dst_it(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
+
+ lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
+}
+
+#endif // __aarch64__
+
+} // namespace cpu
+} // namespace arm_compute
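On aarch64 the 8-bit unary operation reduces to a single 256-entry table lookup: the lut argument maps every possible quantized input byte straight to the quantized output byte, and lut_u8_neon applies that table across each row of the window. A sketch of the idea with a plain scalar apply loop standing in for the NEON TBL-based implementation; build_exp_lut and apply_lut are illustrative names, not library functions:

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    std::array<uint8_t, 256> build_exp_lut(float in_scale, int32_t in_offset, float out_scale, int32_t out_offset)
    {
        std::array<uint8_t, 256> lut{};
        for (int i = 0; i < 256; ++i)
        {
            const float x = (i - in_offset) * in_scale;  // dequantize input byte value i
            const int   q = static_cast<int>(std::lround(std::exp(x) / out_scale)) + out_offset;
            lut[i]        = static_cast<uint8_t>(std::min(255, std::max(0, q))); // requantize and saturate
        }
        return lut;
    }

    void apply_lut(const std::array<uint8_t, 256> &lut, const uint8_t *src, uint8_t *dst, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
        {
            dst[i] = lut[src[i]]; // one lookup replaces dequantize + exp + requantize per element
        }
    }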
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..3e4b88eb47
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Window.h"
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifndef __aarch64__
+// Fallback function used on armv7a; on aarch64 a LUT-based implementation is used instead.
+void neon_qasymm8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_op<uint8_t>(in, out, window, op);
+}
+#endif // #ifndef __aarch64__
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..a5f4b053e3
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Window.h"
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifndef __aarch64__
+// Fallback function used on armv7a; on aarch64 a LUT-based implementation is used instead.
+void neon_qasymm8_signed_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_op<int8_t>(in, out, window, op);
+}
+#endif // #ifndef __aarch64__
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
new file mode 100644
index 0000000000..22ff43c5d9
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_sve_op<float16_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
new file mode 100644
index 0000000000..394bd47adf
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_sve_op<float32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
new file mode 100644
index 0000000000..5af534d9e7
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType, typename VectorType>
+inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnary::RSQRT:
+ return svinvsqrt(pg, a);
+ case ElementWiseUnary::EXP:
+ return wrapper::svexp_z(pg, a);
+ case ElementWiseUnary::NEG:
+ return svneg_z(pg, a);
+ case ElementWiseUnary::LOG:
+ return wrapper::svlog_z(pg, a);
+ case ElementWiseUnary::ABS:
+ return svabs_z(pg, a);
+ case ElementWiseUnary::ROUND:
+ return svrintn_z(pg, a);
+ case ElementWiseUnary::SIN:
+ return wrapper::svsin_z(pg, a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED");
+ }
+}
+
+template <typename ScalarType, typename VectorType>
+inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnary::NEG:
+ return svneg_z(pg, a);
+ case ElementWiseUnary::ABS:
+ return svabs_z(pg, a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED");
+ }
+}
+
+template <typename ScalarType>
+void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ const auto all_true_pg = wrapper::svptrue<ScalarType>();
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto vin = svld1(pg, input_ptr + x);
+ svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input, output);
+}
+
+template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+
+} // namespace cpu
+} // namespace arm_compute
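
The SVE loop above is driven entirely by the while-lt predicate: full vectors are processed until the tail, where the predicate masks off out-of-range lanes. The following is a minimal scalar sketch of what each lane computes, useful as a reference when checking the kernel; the enum and helper names are ours for illustration and not part of the library.

// Scalar reference for the unary ops handled by elementwise_sve_op (illustrative only).
#include <cmath>

enum class UnaryOp { RSQRT, EXP, NEG, LOG, ABS, ROUND, SIN };

inline float scalar_unary_reference(UnaryOp op, float a)
{
    switch (op)
    {
        case UnaryOp::RSQRT: return 1.0f / std::sqrt(a);
        case UnaryOp::EXP:   return std::exp(a);
        case UnaryOp::NEG:   return -a;
        case UnaryOp::LOG:   return std::log(a);
        case UnaryOp::ABS:   return std::fabs(a);
        case UnaryOp::ROUND: return std::nearbyint(a); // round to nearest, ties to even, like svrintn
        case UnaryOp::SIN:   return std::sin(a);
    }
    return a; // unreachable
}

// The predicated kernel is then equivalent to:
//   for (int x = window_start_x; x < window_end_x; ++x) { out[x] = scalar_unary_reference(op, in[x]); }
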
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.h b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h
new file mode 100644
index 0000000000..f2068dc63f
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
+#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
new file mode 100644
index 0000000000..e27fe5a87f
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_sve_op<int32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
new file mode 100644
index 0000000000..4e4582debb
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ auto win = window;
+ const auto window_end_x = window.x().end();
+ win.set(0, Window::Dimension(0, 1, 1));
+
+ Iterator src_it(in, win);
+ Iterator dst_it(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
+
+ lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
+}
+
+} // namespace cpu
+} // namespace arm_compute
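
The SVE2 kernel above never evaluates the unary operation itself; it streams each row through a caller-provided 256-entry table via lut_u8_sve2. One plausible way such a table could be produced for 8-bit quantized data is to dequantize every code, apply the operation in float, and requantize the result. Below is a sketch under that assumption; the helper name and the shared input/output quantization parameters are our own illustration, not the library's API.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>

// Illustrative: build a 256-entry LUT for a unary op on QASYMM8 data,
// assuming input and output share the same scale/offset.
std::array<uint8_t, 256> make_q8_unary_lut(float scale, int32_t offset, const std::function<float(float)> &op)
{
    std::array<uint8_t, 256> lut{};
    for (int code = 0; code < 256; ++code)
    {
        const float x = scale * (static_cast<float>(code) - static_cast<float>(offset)); // dequantize
        const float y = op(x);                                                           // apply op in fp32
        const int   q = static_cast<int>(std::lround(y / scale)) + offset;               // requantize
        lut[static_cast<std::size_t>(code)] = static_cast<uint8_t>(std::clamp(q, 0, 255));
    }
    return lut;
}

// e.g. auto exp_lut = make_q8_unary_lut(0.05f, 128, [](float v) { return std::exp(v); });
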
diff --git a/src/cpu/kernels/elementwise_unary/list.h b/src/cpu/kernels/elementwise_unary/list.h
new file mode 100644
index 0000000000..a9701afdd8
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/list.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H
+#define SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ELEMETWISE_UNARY_KERNEL(func_name) \
+ void func_name(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+
+DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp32_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp16_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(sve_s32_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(sve2_q8_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp32_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp16_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_s32_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_q8_elementwise_unary);
+#ifndef __aarch64__
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_qasymm8_signed_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_qasymm8_elementwise_unary);
+#endif // __aarch64__
+#undef DECLARE_ELEMETWISE_UNARY_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H
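
Every entry point declared here shares one signature, so the operator kernel that owns them can pick an implementation per data type and ISA from a small priority-ordered table. The sketch below illustrates that idea only; the struct and selector are ours, and the real selection logic lives in CpuElementwiseUnaryKernel using the ITensor/Window-based signature above.

#include <string>
#include <vector>

// Illustrative selection table, not the library's registration code.
struct KernelCaps
{
    bool fp16;
    bool sve;
    bool sve2;
};

struct UnaryKernelEntry
{
    std::string name;                     // e.g. "sve_fp32_elementwise_unary"
    bool (*selected)(const KernelCaps &); // predicate on data type / CPU features
    void (*run)();                        // stands in for the DECLARE_ELEMETWISE_UNARY_KERNEL signature
};

// First matching entry wins, so more specialised kernels (SVE2, SVE) are listed before NEON.
inline const UnaryKernelEntry *select_kernel(const std::vector<UnaryKernelEntry> &table, const KernelCaps &caps)
{
    for (const auto &entry : table)
    {
        if (entry.selected(caps))
        {
            return &entry;
        }
    }
    return nullptr;
}
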
diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h
new file mode 100644
index 0000000000..5ac78df324
--- /dev/null
+++ b/src/cpu/kernels/floor/list.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H
+#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len)
+
+DECLARE_FLOOR_KERNEL(fp16_neon_floor);
+DECLARE_FLOOR_KERNEL(fp32_neon_floor);
+
+#undef DECLARE_FLOOR_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */
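
These floor kernels operate on a flat run of len elements; the owning CpuFloorKernel walks the tensor window and hands each contiguous row to them. A minimal usage sketch on a plain buffer follows, assuming it is compiled inside the library tree so the internal header is reachable and the NEON FP32 kernel is built in.

#include "src/cpu/kernels/floor/list.h"

#include <vector>

// Illustrative only: apply the NEON FP32 floor kernel to a contiguous buffer.
void floor_buffer(const std::vector<float> &src, std::vector<float> &dst)
{
    dst.resize(src.size());
    // The kernel vectorizes the body and handles the scalar tail internally.
    arm_compute::cpu::fp32_neon_floor(src.data(), dst.data(), static_cast<int>(src.size()));
}
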
diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp
new file mode 100644
index 0000000000..f47690277d
--- /dev/null
+++ b/src/cpu/kernels/floor/neon/fp16.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/common/utils/Validate.h"
+#include "src/core/NEON/NEMath.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+constexpr int step = 8;
+
+void fp16_neon_floor(const void *src, void *dst, int len)
+{
+ ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
+ ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
+ ARM_COMPUTE_ASSERT(len >= 0);
+
+ auto psrc = static_cast<const __fp16 *>(src);
+ auto pdst = static_cast<__fp16 *>(dst);
+
+ for (; len >= step; len -= step)
+ {
+ vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc)));
+ psrc += step;
+ pdst += step;
+ }
+
+ for (; len > 0; --len)
+ {
+ *pdst = std::floor(*psrc);
+ ++psrc;
+ ++pdst;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp
new file mode 100644
index 0000000000..a86e24d3c3
--- /dev/null
+++ b/src/cpu/kernels/floor/neon/fp32.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/common/utils/Validate.h"
+#include "src/core/NEON/NEMath.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+constexpr int step = 4;
+
+void fp32_neon_floor(const void *src, void *dst, int len)
+{
+ ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
+ ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
+ ARM_COMPUTE_ASSERT(len >= 0);
+
+ auto psrc = static_cast<const float *>(src);
+ auto pdst = static_cast<float *>(dst);
+
+ for (; len >= step; len -= step)
+ {
+ vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
+ psrc += step;
+ pdst += step;
+ }
+
+ for (; len > 0; --len)
+ {
+ *pdst = std::floor(*psrc);
+ ++pdst;
+ ++psrc;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
new file mode 100644
index 0000000000..8f47ecba8f
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fused_batch_normalization_conv_f16(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
+}
+
+void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
new file mode 100644
index 0000000000..3ca5b6977a
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fused_batch_normalization_conv_f32(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
new file mode 100644
index 0000000000..0c90abccb1
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
+#define ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T, bool fused_activation, typename F>
+void batch_normalization_nchw(const Window &window,
+ ITensor *in,
+ ITensor *out,
+ const ITensor *in_mean,
+ const ITensor *in_var,
+ const ITensor *in_beta,
+ const ITensor *in_gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win_to_use = window;
+ win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win_to_use);
+ Iterator output(out, win_to_use);
+
+ F activation_functor(act_info);
+
+ // Hold information about the current feature map we are iterating.
+ // Only compute denominator and constants once per feature map.
+ int slice = -1;
+
+ const auto input_mean = reinterpret_cast<const T *>(in_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const T *>(in_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (in_gamma != nullptr) ? reinterpret_cast<const T *>(in_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (in_beta != nullptr) ? reinterpret_cast<const T *>(in_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+ T mean = static_cast<T>(0);
+ T var = static_cast<T>(0);
+ T gamma = static_cast<T>(1);
+ T beta = static_cast<T>(0);
+ T denominator = static_cast<T>(0);
+
+ auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ auto var_vec = wrapper::vdup_n(var, ExactTagType{});
+ auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ auto beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
+ const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+ execute_window_loop(
+ win_to_use,
+ [&](const Coordinates &id)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ if (slice != id.z())
+ {
+ mean = input_mean[id.z()];
+ var = input_var[id.z()];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id.z()];
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ }
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id.z()];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ // Calculate denominator
+ denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ denominator = wrapper::vgetlane(denominator_vec, 0);
+ slice = id.z();
+ }
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator_vec);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const T numerator = input_ptr[x] - mean;
+ const T x_bar = numerator * denominator;
+ T res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *(output_ptr + x) = res;
+ }
+ },
+ input, output);
+}
+
+template <typename T>
+void fused_batch_normalization_conv(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ using ScalarType = T;
+ const int size = 16 / conv_weights->info()->element_size();
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
+ const bool run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
+
+    // Collapse the window along X; the loops below vectorize over that dimension manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = size;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Iterator conv_w_in(conv_weights, win);
+ Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win);
+
+ const auto conv_bias_in =
+ (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto conv_bias_out =
+ (run_in_place_bias ? conv_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+
+ const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+
+ auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{});
+ auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
+
+ auto mean = ScalarType(0.0);
+ auto var = ScalarType(0.0);
+ auto gamma = ScalarType(1.0);
+ auto beta = ScalarType(0.0);
+ auto conv_bias_in_scalar = ScalarType(0.0);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ var = input_var[id[3]];
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id[3]];
+ }
+
+ if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
+ {
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id[3]];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ // Construct vectors
+ mean = input_mean[id[3]];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+
+ if (conv_bias_in != nullptr)
+ {
+ conv_bias_in_scalar = conv_bias_in[id[3]];
+ }
+ auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta;
+ }
+
+ int x = window_start_x;
+ auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
+ auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto wn = wrapper::vloadq(conv_w_in_ptr + x);
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
+
+ // Store results
+ wrapper::vstore(conv_w_out_ptr + x, wn);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ conv_w_in, conv_w_out);
+}
+template <typename T>
+void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ using ScalarType = T;
+ const int size = 16 / dwc_weights->info()->element_size();
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights);
+ const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias);
+
+    // Collapse the window along X; the loops below vectorize over that dimension manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = size;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Iterator dwc_w_in(dwc_weights, win);
+ Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
+
+ const auto dwc_bias_in =
+ (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto dwc_bias_out =
+ (run_in_place_bias ? dwc_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+
+ const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+
+ auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{});
+ auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
+
+ auto mean = ScalarType(0.0);
+ auto var = ScalarType(0.0);
+ auto gamma = ScalarType(1.0);
+ auto beta = ScalarType(0.0);
+ auto dwc_bias_in_scalar = ScalarType(0.0);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ var = input_var[id[2]];
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id[2]];
+ }
+
+ if (id[1] == 0)
+ {
+ mean = input_mean[id[2]];
+
+ // Construct vectors
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id[2]];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_in_scalar = dwc_bias_in[id[2]];
+ }
+
+ auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta;
+ }
+
+ int x = window_start_x;
+ auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
+
+ // Store results
+ wrapper::vstore(dwc_w_out_ptr + x, wn);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ dwc_w_in, dwc_w_out);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
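
Both fusion templates above implement the same per-channel algebra: the batch-normalization statistics are folded into the convolution (or depthwise) weights and bias so that no separate normalization pass is needed at inference time. A scalar sketch of that folding for a single channel, matching the vectorized code; the helper is ours and purely illustrative.

#include <cmath>
#include <cstddef>

// Per-channel batch-norm folding, scalar reference:
//   W'[c] = W[c] * gamma[c] / sqrt(var[c] + eps)
//   b'[c] = (b[c] - mean[c]) / sqrt(var[c] + eps) * gamma[c] + beta[c]
void fold_bn_channel(float *weights, std::size_t n_weights, float &bias,
                     float mean, float var, float gamma, float beta, float eps)
{
    const float rstd = 1.0f / std::sqrt(var + eps);
    for (std::size_t i = 0; i < n_weights; ++i)
    {
        weights[i] *= gamma * rstd;
    }
    bias = (bias - mean) * rstd * gamma + beta;
}
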
diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h
new file mode 100644
index 0000000000..a03dd74f78
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/list.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_LIST_H
+#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name) \
+ void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
+
+#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name) \
+ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
+
+#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name) \
+ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
+
+DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16);
+DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32);
+DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(fused_batch_normalization_dwc_nhwc_f16);
+DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(fused_batch_normalization_dwc_nhwc_f32);
+DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_nchw_f16);
+DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_nchw_f32);
+
+#undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL
+#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL
+#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_LIST_H
diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
new file mode 100644
index 0000000000..25580e1bec
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp
new file mode 100644
index 0000000000..ae4c7e5736
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_batch_normalization_nchw_non_fused(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>(window, input, output, mean, var, beta,
+ gamma, epsilon, act_info);
+}
+
+void fp16_batch_normalization_nchw_non_fused_relu(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>(window, input, output, mean, var, beta, gamma,
+ epsilon, act_info);
+}
+
+void fp16_batch_normalization_nchw_non_fused_brelu(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>(window, input, output, mean, var, beta,
+ gamma, epsilon, act_info);
+}
+
+void fp16_batch_normalization_nchw_non_fused_lubrelu(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>(window, input, output, mean, var, beta,
+ gamma, epsilon, act_info);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp
new file mode 100644
index 0000000000..ae2db1ac66
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_batch_normalization_nchw_non_fused(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float, false, detail::dummy<float, 4>>(window, input, output, mean, var, beta, gamma,
+ epsilon, act_info);
+}
+
+void fp32_batch_normalization_nchw_non_fused_relu(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float, true, detail::relu<float, 4>>(window, input, output, mean, var, beta, gamma,
+ epsilon, act_info);
+}
+
+void fp32_batch_normalization_nchw_non_fused_brelu(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float, true, detail::brelu<float, 4>>(window, input, output, mean, var, beta, gamma,
+ epsilon, act_info);
+}
+
+void fp32_batch_normalization_nchw_non_fused_lubrelu(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
+{
+ batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>(window, input, output, mean, var, beta, gamma,
+ epsilon, act_info);
+}
+} // namespace cpu
+} // namespace arm_compute
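
The four FP32 entry points above differ only in the activation functor baked into the template instantiation, so a caller typically maps the requested ActivationLayerInfo onto one of them once at configure time. Below is a hedged sketch of that routing; the selector function and the redeclarations are ours for illustration, and the real mapping lives in the batch-normalization layer kernel.

// Illustrative routing from activation type to the specialized FP32 NCHW entry points.
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"

namespace arm_compute
{
namespace cpu
{
// Redeclarations matching the definitions above, so the sketch is self-contained.
#define BN_NCHW_FP32_SIG(name)                                                                          \
    void name(const Window &, ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, \
              const ITensor *, float, ActivationLayerInfo)
BN_NCHW_FP32_SIG(fp32_batch_normalization_nchw_non_fused);
BN_NCHW_FP32_SIG(fp32_batch_normalization_nchw_non_fused_relu);
BN_NCHW_FP32_SIG(fp32_batch_normalization_nchw_non_fused_brelu);
BN_NCHW_FP32_SIG(fp32_batch_normalization_nchw_non_fused_lubrelu);
#undef BN_NCHW_FP32_SIG

using BnFn = void (*)(const Window &, ITensor *, ITensor *, const ITensor *, const ITensor *,
                      const ITensor *, const ITensor *, float, ActivationLayerInfo);

inline BnFn select_fp32_nchw(const ActivationLayerInfo &act)
{
    using Act = ActivationLayerInfo::ActivationFunction;
    if (!act.enabled())
    {
        return fp32_batch_normalization_nchw_non_fused;
    }
    switch (act.activation())
    {
        case Act::RELU:
            return fp32_batch_normalization_nchw_non_fused_relu;
        case Act::BOUNDED_RELU:
            return fp32_batch_normalization_nchw_non_fused_brelu;
        case Act::LU_BOUNDED_RELU:
            return fp32_batch_normalization_nchw_non_fused_lubrelu;
        default:
            return fp32_batch_normalization_nchw_non_fused; // other activations are not fused here
    }
}
} // namespace cpu
} // namespace arm_compute
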
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
new file mode 100644
index 0000000000..1d88d3b494
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
+#include "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
new file mode 100644
index 0000000000..1f336bb196
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
+#include "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
new file mode 100644
index 0000000000..5b74a7aef6
--- /dev/null
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H
+#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
+{
+ using ScalarType = T;
+ const int size = 16 / dwc_weights->info()->element_size();
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights);
+ const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias);
+
+    // Collapse the window along X; the loops below vectorize over that dimension manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = size;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Iterator dwc_w_in(dwc_weights, win);
+ Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
+
+ const auto dwc_bias_in =
+ (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto dwc_bias_out =
+ (run_in_place_bias ? dwc_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+
+ const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+
+ auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{});
+ auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto dwc_bias_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
+
+ auto gamma = ScalarType(1.0);
+ auto beta = ScalarType(0.0);
+ auto dwc_bias_in_scalar = ScalarType(0);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ var_vec = wrapper::vloadq(input_var + x);
+ if (input_gamma != nullptr)
+ {
+ gamma_vec = wrapper::vloadq(input_gamma + x);
+ }
+
+ if ((id[2] == 0) && (id[1] == 0))
+ {
+ mean_vec = wrapper::vloadq(input_mean + x);
+
+ // Construct vectors
+ if (input_beta != nullptr)
+ {
+ beta_vec = wrapper::vloadq(input_beta + x);
+ }
+
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
+ }
+
+ auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec),
+ wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
+ dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
+ wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
+ }
+
+ auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+
+ auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
+
+ // Store results
+ wrapper::vstore(dwc_w_out_ptr + x, wn);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto var = input_var[x];
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[x];
+ }
+
+ if (id[2] == 0 && id[1] == 0)
+ {
+ auto mean = input_mean[x];
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[x];
+ }
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_in_scalar = dwc_bias_in[x];
+ }
+
+ auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta;
+ }
+
+ const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+
+ *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ dwc_w_in, dwc_w_out);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..4d7507a5da
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ const float16x8_t beta_f16 = vdupq_n_f16(beta);
+
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window.collapse_if_possible(window, Window::DimZ);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x);
+ const float16x8x2_t c = vld2q_f16(in_ptr + x);
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+ alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+
+ vst2q_f16(out_ptr + x, alpha_ab);
+ }
+
+ // Left-over loop
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta);
+ }
+ },
+ in, out);
+}
+} // namespace
+void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta)
+{
+ return matrix_addition_f16(src, dst, window, beta);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..fa3f4de11f
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta)
+{
+ return matrix_addition_f32(src, dst, window, beta);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
new file mode 100644
index 0000000000..47de0f3928
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
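+// Adds the src matrix (matrix C), scaled by beta, onto dst.
+// A scalar sketch of what the vectorized loop below computes (illustrative only):
+//   for (size_t i = 0; i < num_elements; ++i)
+//   {
+//       dst[i] += beta * src[i];
+//   }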
+void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ const float32x4_t beta_f32 = vdupq_n_f32(beta);
+
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window.collapse_if_possible(window, Window::DimZ);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x);
+ const float32x4x4_t c = vld4q_f32(in_ptr + x);
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+ alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+ alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+ alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+
+ vst4q_f32(out_ptr + x, alpha_ab);
+ }
+
+ // Left-over loop
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) += *(in_ptr + x) * beta;
+ }
+ },
+ in, out);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h
new file mode 100644
index 0000000000..26ac99b483
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_GEMMMATRIXADD_IMPL_H
+#define SRC_CORE_KERNELS_GEMMMATRIXADD_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_KERNELS_GEMMMATRIXADD_IMPL_H
diff --git a/src/cpu/kernels/gemm_matrix_add/list.h b/src/cpu/kernels/gemm_matrix_add/list.h
new file mode 100644
index 0000000000..415b4c8321
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_add/list.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_GEMMMATRIXADD_LIST_H
+#define SRC_CORE_NEON_KERNELS_GEMMMATRIXADD_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_GEMMMATRIXADD_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const Window &window, float beta)
+DECLARE_GEMMMATRIXADD_KERNEL(neon_fp32_gemm_matrix_add);
+DECLARE_GEMMMATRIXADD_KERNEL(neon_fp16_gemm_matrix_add);
+#undef DECLARE_GEMMMATRIXADD_KERNEL
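+// Each DECLARE_GEMMMATRIXADD_KERNEL(name) above expands to a plain declaration, for example:
+//   void neon_fp32_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta);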
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_GEMMMATRIXADD_LIST_H
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..60fda511e3
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void vector_matrix_multiply_f16(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size());
+ const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0));
+
+ // The implementation computes 32 elements per iteration
+ const int window_start_x = 32 * info.thread_id;
+ const int window_step_x = 32 * info.num_threads;
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
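+    // The dst columns are partitioned statically across threads: each thread starts at column
+    // 32 * info.thread_id and advances in strides of 32 * info.num_threads, so with e.g. 4 threads,
+    // thread 1 processes columns [32, 64), [160, 192), [288, 320), ...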
+ ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x,
+ " (window_end_x - window_start_x) must be multiple of window_step_x");
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (rhs->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(lhs, win_a);
+ Iterator inb(rhs, win_b);
+ Iterator out(dst, win_out);
+
+ const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
+
+ const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
+
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+            // Here we don't check for x <= (window_end_x - window_step_x) because window_end_x is
+            // rounded up above, so a full step at the boundary could write out of bounds in the dst.
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ if (x > width_matrix_b)
+ {
+ return;
+ }
+
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
+
+ float16x8_t acc0 = vdupq_n_f16(0.f);
+ float16x8_t acc1 = vdupq_n_f16(0.f);
+ float16x8_t acc2 = vdupq_n_f16(0.f);
+ float16x8_t acc3 = vdupq_n_f16(0.f);
+
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4);)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+ matrix_b += 2 * in_b_stride;
+
+ b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+ vec_a += 4;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float16_t a0 = *vec_a;
+ const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+ acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+ acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+ acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc0 = vmulq_f16(acc0, alpha_f16);
+ acc1 = vmulq_f16(acc1, alpha_f16);
+ acc2 = vmulq_f16(acc2, alpha_f16);
+ acc3 = vmulq_f16(acc3, alpha_f16);
+ }
+
+ auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+
+ vst1q_f16(vec_out + 0, acc0);
+ vst1q_f16(vec_out + 8, acc1);
+ vst1q_f16(vec_out + 16, acc2);
+ vst1q_f16(vec_out + 24, acc3);
+ }
+
+ for (; x < window_end_x; ++x)
+ {
+ if (x > width_matrix_b)
+ {
+ return;
+ }
+
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
+
+ float16x4_t vacc = vdup_n_f16(0.f);
+
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ const float16x4_t b_col = {
+ *(matrix_b + 0 * in_b_stride),
+ *(matrix_b + 1 * in_b_stride),
+ *(matrix_b + 2 * in_b_stride),
+ *(matrix_b + 3 * in_b_stride),
+ };
+
+ vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
+
+ matrix_b += 4 * in_b_stride;
+ }
+
+ float16_t acc =
+ vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
+
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float16_t a0 = *vec_a;
+ const float16_t b00 = *matrix_b;
+
+ acc += b00 * a0;
+
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc *= static_cast<float16_t>(alpha);
+ }
+
+ auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+
+ *(vec_out) = acc;
+ }
+ },
+ ina, inb, out);
+}
+
+void matrix_matrix_multiply_f16(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+{
+ ARM_COMPUTE_UNUSED(info);
+ const int out_width = static_cast<int>(dst->info()->dimension(0));
+ const int out_height = static_cast<int>(dst->info()->dimension(1));
+ const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+ const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+ const int num_elems_matrix_b_x = rhs->info()->dimension(0);
+
+    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the dst matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (rhs->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+    // Set step_x and step_y for matrix B. Scale the X range by a factor of 8 as the transposed input matrix B has 8 times fewer columns than the dst matrix
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(lhs, win_a);
+ Iterator inb(rhs, win_b);
+ Iterator out(dst, window);
+
+ const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
+
+ const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
+ const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
+ auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
+ float16x8x4_t c = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}};
+
+ /*
+ This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ |a00 a01 a02 a03 | a04 a05 a06 a07|
+ |a10 a11 a12 a13 | a14 a15 a16 a17|
+ |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ...
+            |a30 a31 a32 a33 | a34 a35 a36 a37|   | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a16 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ...
+ |a40 a41 a42 a43 | a44 a45 a46 a47|
+ |a50 a51 a52 a53 | a54 a55 a56 a57|
+ |a60 a61 a62 a63 | a64 a65 a66 a67|
+ |a70 a71 a72 a73 | a74 a75 a76 a77|
+
+ After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ]
+
+            Matrix B has been transposed as shown below
+
+ |b00 b01 b02 b03 b04 b05 b06 b07|
+ |b10 b11 b12 b13 b14 b15 b16 b17|
+ |b20 b21 b22 b23 b24 b25 b26 b27|
+ |b30 b31 b32 b33 b34 b35 b36 b37|
+ ------------------->
+
+ |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37|
+
+ c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30
+ c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31
+
+            The dst tensor's XY-plane must have the shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
+ */
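+            // A scalar sketch of one depth step of the accumulation below (illustrative only), with
+            // a[0..3] holding 4 interleaved values of A and b[0..7] a transposed 1x8 slice of B:
+            //   for (int r = 0; r < 4; ++r)
+            //       for (int n = 0; n < 8; ++n)
+            //           c.val[r][n] += a[r] * b[n];
+            // The NEON loop unrolls four such depth steps per iteration (16 values of A, 32 of B).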
+ const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+
+ for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
+
+ {
+ const float16x8_t p00 = vld1q_f16(mtx_a0);
+ const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+ const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
+ const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
+ const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+
+ mtx_a0 += 16;
+ mtx_b0 += 32;
+ }
+
+ for (; mtx_b0 < mtx_b0_end_addr;)
+
+ {
+ const float16x4_t p00 = vld1_f16(mtx_a0);
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
+
+ mtx_a0 += 4;
+ mtx_b0 += 8;
+ }
+
+ if (multiply_alpha)
+ {
+ c.val[0] = vmulq_f16(c.val[0], alpha_f16);
+ c.val[1] = vmulq_f16(c.val[1], alpha_f16);
+ c.val[2] = vmulq_f16(c.val[2], alpha_f16);
+ c.val[3] = vmulq_f16(c.val[3], alpha_f16);
+ }
+
+ if (id.x() < (out_width - 8))
+ {
+ vst1q_f16(mtx_out, c.val[0]);
+ if (id.y() + 1 < out_height)
+ {
+ vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
+ if (id.y() + 2 < out_height)
+ {
+ vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
+ }
+ }
+ }
+ }
+ else
+ {
+ // Left-over columns
+ const int columns_left = out_width - id.x();
+ for (int x = 0; x < columns_left; ++x)
+ {
+ *(mtx_out + x) = c.val[0][x];
+ if (id.y() + 1 < out_height)
+ {
+ *(mtx_out + x + 1 * out_stride) = c.val[1][x];
+ if (id.y() + 2 < out_height)
+ {
+ *(mtx_out + x + 2 * out_stride) = c.val[2][x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out + x + 3 * out_stride) = c.val[3][x];
+ }
+ }
+ }
+ }
+ }
+ },
+ ina, inb, out);
+}
+
+void neon_fp16_gemm_matrix_mul(const ITensor *lhs,
+ const ITensor *rhs,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info,
+ float alpha,
+ const bool is_dst_vector)
+{
+ return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha)
+ : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..e12a312280
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_gemm_matrix_mul(const ITensor *lhs,
+ const ITensor *rhs,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info,
+ float alpha,
+ const bool is_dst_vector)
+{
+ return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha)
+ : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
new file mode 100644
index 0000000000..404d070a37
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h"
+
+#include "src/core/utils/helpers/float_ops.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void vector_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
+ const auto in_b_stride =
+ static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (rhs->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(lhs, win_a);
+ Iterator inb(rhs, win_b);
+ Iterator out(dst, win_out);
+
+ const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
+
+ const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+            // Here we don't check for x <= (window_end_x - window_step_x) because window_end_x is
+            // rounded up above, so a full step at the boundary could write out of bounds in the dst.
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ if (x > width_matrix_b)
+ {
+ return;
+ }
+
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
+
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif /* __arm__ */
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
+
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif /* __arm__ */
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+
+ a0l = vld1_f32(vec_a);
+
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float a0 = *vec_a;
+
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc0 = vmulq_f32(acc0, alpha_f32);
+ acc1 = vmulq_f32(acc1, alpha_f32);
+ acc2 = vmulq_f32(acc2, alpha_f32);
+ acc3 = vmulq_f32(acc3, alpha_f32);
+ }
+
+ const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ }
+
+ // Left-over loop
+ for (; x < window_end_x; ++x)
+ {
+ if (x > width_matrix_b)
+ {
+ return;
+ }
+
+ float32x4_t vacc = vdupq_n_f32(0.f);
+
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif /* __arm__ */
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
+ {
+ const float32x4_t a0l = vld1q_f32(vec_a);
+
+ const float32x4_t b_col = {
+ *(matrix_b + 0 * in_b_stride),
+ *(matrix_b + 1 * in_b_stride),
+ *(matrix_b + 2 * in_b_stride),
+ *(matrix_b + 3 * in_b_stride),
+ };
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif /* __arm__ */
+
+ vacc = vmlaq_f32(vacc, b_col, a0l);
+
+ matrix_b += 4 * in_b_stride;
+ }
+
+ float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) +
+ vgetq_lane_f32(vacc, 3);
+
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float a0 = *vec_a;
+
+ const float b00 = *matrix_b;
+
+ acc += b00 * a0;
+
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc *= alpha;
+ }
+
+ const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+
+ *vec_out = acc;
+ }
+ },
+ ina, inb, out);
+}
+
+void matrix_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+{
+ ARM_COMPUTE_UNUSED(info);
+ const int out_width = static_cast<int>(dst->info()->dimension(0));
+ const int out_height = static_cast<int>(dst->info()->dimension(1));
+ const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+ const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = rhs->info()->dimension(0);
+
+    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the dst matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (rhs->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+    // Set step_x and step_y for matrix B. Scale the X range by a factor of 4 as the transposed input matrix B has 4 times fewer columns than the dst matrix
+    // The step along the x direction is 2 times in_b_stride because each iteration computes 2 blocks of size 4x4
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(lhs, win_a);
+ Iterator inb(rhs, win_b);
+ Iterator out(dst, window);
+
+ const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
+
+ const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+
+    // The implementation assumes that matrix A and matrix B have been reshaped with CpuGemmInterleave4x4 and CpuGemmTranspose1xW respectively
+    // Reshaping the matrices gives a cache-friendly implementation and avoids the data re-arrangements needed for computing 16x4 elements per iteration
+ // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
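+    // A scalar sketch of one depth step of the accumulation below (illustrative only), with a[0..3]
+    // holding 4 interleaved values of A and b0[0..3] / b1[0..3] the transposed 1x4 slices of B feeding
+    // the two adjacent 4x4 output blocks:
+    //   for (int r = 0; r < 4; ++r)
+    //       for (int n = 0; n < 4; ++n)
+    //       {
+    //           acc_block0[r][n] += a[r] * b0[n];
+    //           acc_block1[r][n] += a[r] * b1[n];
+    //       }
+    // The NEON loop below unrolls eight such depth steps per iteration.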
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ float32x4_t acc00 = vdupq_n_f32(0.f);
+ float32x4_t acc10 = vdupq_n_f32(0.f);
+ float32x4_t acc20 = vdupq_n_f32(0.f);
+ float32x4_t acc30 = vdupq_n_f32(0.f);
+
+ float32x4_t acc01 = vdupq_n_f32(0.f);
+ float32x4_t acc11 = vdupq_n_f32(0.f);
+ float32x4_t acc21 = vdupq_n_f32(0.f);
+ float32x4_t acc31 = vdupq_n_f32(0.f);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+
+ auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+ for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+ float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ for (; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ mtx_a0 += 4;
+ mtx_b0 += 4;
+ mtx_b1 += 4;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc00 = vmulq_f32(acc00, alpha_f32);
+ acc10 = vmulq_f32(acc10, alpha_f32);
+ acc20 = vmulq_f32(acc20, alpha_f32);
+ acc30 = vmulq_f32(acc30, alpha_f32);
+ acc01 = vmulq_f32(acc01, alpha_f32);
+ acc11 = vmulq_f32(acc11, alpha_f32);
+ acc21 = vmulq_f32(acc21, alpha_f32);
+ acc31 = vmulq_f32(acc31, alpha_f32);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
+ const auto mtx_out1 = mtx_out0 + 4;
+
+ if (id.x() < (out_width - 8))
+ {
+ vst1q_f32(mtx_out0, acc00);
+ vst1q_f32(mtx_out1, acc01);
+ if (id.y() + 1 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ vst1q_f32(mtx_out1 + out_stride1, acc11);
+ if (id.y() + 2 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ vst1q_f32(mtx_out1 + out_stride2, acc21);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ vst1q_f32(mtx_out1 + out_stride3, acc31);
+ }
+ }
+ }
+ }
+ else if (id.x() < (out_width - 4))
+ {
+ vst1q_f32(mtx_out0, acc00);
+ if (id.y() + 1 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ if (id.y() + 2 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ }
+ }
+ }
+ // Left-over columns
+ const int columns_left = out_width - id.x() - 4;
+ for (auto x = 0; x < columns_left; ++x)
+ {
+ *(mtx_out1 + x) = acc01[x];
+ if (id.y() + 1 < out_height)
+ {
+ *(mtx_out1 + x + out_stride1) = acc11[x];
+ if (id.y() + 2 < out_height)
+ {
+ *(mtx_out1 + x + out_stride2) = acc21[x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out1 + x + out_stride3) = acc31[x];
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ // Left-over columns
+ const int columns_left = out_width - id.x();
+ for (int x = 0; x < columns_left; ++x)
+ {
+ *(mtx_out0 + x) = acc00[x];
+ if (id.y() + 1 < out_height)
+ {
+ *(mtx_out0 + x + out_stride1) = acc10[x];
+ if (id.y() + 2 < out_height)
+ {
+ *(mtx_out0 + x + out_stride2) = acc20[x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out0 + x + out_stride3) = acc30[x];
+ }
+ }
+ }
+ }
+ }
+ },
+ ina, inb, out);
+}
+} // namespace cpu
+
+} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
new file mode 100644
index 0000000000..74ea4c2b17
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
+#define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/CPP/Validate.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void vector_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+
+void matrix_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h
new file mode 100644
index 0000000000..15b23b1d81
--- /dev/null
+++ b/src/cpu/kernels/gemm_matrix_mul/list.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_GEMMMATRIXMUL_LIST_H
+#define SRC_CORE_NEON_KERNELS_GEMMMATRIXMUL_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \
+ void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \
+ float alpha, const bool is_dst_vector)
+DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul);
+DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul);
+#undef DECLARE_GEMMMATRIXMUL_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_GEMMMATRIXMUL_LIST_H
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..4ed7e54f1c
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
+{
+ return compute_all_anchors<float16_t>(anchors, all_anchors, anchors_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..f15cd63bb2
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
+{
+ return compute_all_anchors<float>(anchors, all_anchors, anchors_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
new file mode 100644
index 0000000000..8cb76f3afb
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+void compute_all_anchors_qasymm16(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
+{
+ Iterator all_anchors_it(all_anchors, window);
+ Iterator anchors_it(all_anchors, window);
+
+ const size_t num_anchors = anchors->info()->dimension(1);
+ const float stride = 1.f / anchors_info.spatial_scale();
+ const size_t feat_width = anchors_info.feat_width();
+
+ const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
+
+ const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+
+ const size_t shift_idy = id.y() / num_anchors;
+ const float shiftx = (shift_idy % feat_width) * stride;
+ const float shifty = (shift_idy / feat_width) * stride;
+
+ const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
+ const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
+ const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
+ const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
+
+ *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
+ *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
+ *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
+ *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
+ },
+ all_anchors_it);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h
new file mode 100644
index 0000000000..3317bcfbe6
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
+#define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void compute_all_anchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
+{
+ Iterator all_anchors_it(all_anchors, window);
+ Iterator anchors_it(all_anchors, window);
+
+ const size_t num_anchors = anchors->info()->dimension(1);
+ const T stride = 1.f / anchors_info.spatial_scale();
+ const size_t feat_width = anchors_info.feat_width();
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
+
+ const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+
+ const size_t shift_idy = id.y() / num_anchors;
+ const T shiftx = (shift_idy % feat_width) * stride;
+ const T shifty = (shift_idy / feat_width) * stride;
+
+ *out_anchor_ptr = *anchor_ptr + shiftx;
+ *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
+ *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
+ *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
+ },
+ all_anchors_it);
+}
+
+void compute_all_anchors_qasymm16(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
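
The indexing in compute_all_anchors above is easier to follow in scalar form: output row y holds base anchor y % num_anchors translated to feature-map cell y / num_anchors, with the shift scaled back to image coordinates by 1 / spatial_scale. A minimal standalone sketch of that arithmetic (plain C++ for illustration; the values and names are not taken from the library):

#include <cstddef>
#include <cstdio>

// Scalar walk-through of the anchor-shifting arithmetic: output row y holds
// base anchor (y % num_anchors) translated to feature-map cell (y / num_anchors),
// scaled back to image coordinates by the stride.
int main()
{
    const std::size_t num_anchors = 3;     // A: anchors per location (assumed value)
    const std::size_t feat_width  = 4;     // W: feature-map width (assumed value)
    const float       stride      = 16.f;  // 1 / spatial_scale (assumed value)

    for (std::size_t y = 0; y < num_anchors * feat_width; ++y)
    {
        const std::size_t anchor_offset = y % num_anchors; // which base anchor
        const std::size_t cell          = y / num_anchors; // which feature-map cell
        const float       shiftx        = (cell % feat_width) * stride;
        const float       shifty        = (cell / feat_width) * stride;
        std::printf("row %zu: base anchor %zu shifted by (%.0f, %.0f)\n", y, anchor_offset, shiftx, shifty);
    }
    return 0;
}
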
diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
new file mode 100644
index 0000000000..7182d0b27d
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu16_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
+{
+ return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/list.h b/src/cpu/kernels/genproposals/list.h
new file mode 100644
index 0000000000..570c686e89
--- /dev/null
+++ b/src/cpu/kernels/genproposals/list.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H
+#define SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(func_name) \
+ void func_name(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+
+DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_qu16_computeallanchors);
+DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_fp16_computeallanchors);
+DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_fp32_computeallanchors);
+
+#undef DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H */
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..44418c0bb9
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+template <typename InputType, typename AccType>
+void vector_float_sum_fp16(AccType &result, AccType &result_square, const InputType &inputs)
+{
+ result = wrapper::vadd(result, inputs);
+ result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
+}
+
+template <typename InputType, typename AccType>
+InputType vector_float_norm_fp16(const InputType &inputs,
+ const AccType &vec_mean,
+ const AccType &vec_multip,
+ const AccType &vec_beta)
+{
+ return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
+}
+
+template <>
+inline void vector_float_sum_fp16(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs)
+{
+ vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs)));
+ vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
+}
+template <>
+inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs,
+ const float32x4_t &vec_mean,
+ const float32x4_t &vec_multip,
+ const float32x4_t &vec_beta)
+{
+ const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
+ const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
+ const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta));
+ const auto result_high =
+ wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta));
+ float16x8_t result = wrapper::vcombine(result_low, result_high);
+
+ return result;
+}
+
+template <typename AccType>
+void instance_normalization_nchw_fp16(
+ const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
+
+ // Clear X/Y dimensions on execution window as we handle the planes manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(float16_t);
+ const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
+
+ Iterator input_it(input, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<AccType>(0.f);
+ auto sum_squares_h_w = static_cast<AccType>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+ }
+
+ auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = static_cast<AccType>(*(input_ptr + x));
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+ const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ const auto vec_val = wrapper::vloadq(input_ptr + x);
+ const auto normalized_vec =
+ vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+ wrapper::vstore(output_ptr + x, normalized_vec);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto val = static_cast<AccType>(*(input_ptr + x));
+ *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta);
+ }
+ },
+ input_plane_it, output_plane_it);
+ },
+ input_it);
+}
+} // namespace
+
+void neon_fp16_instancenorm(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)
+{
+ if (use_mixed_precision)
+ {
+ return instance_normalization_nchw_fp16<float>(input, output, gamma, beta, epsilon, window);
+ }
+ else
+ {
+ return instance_normalization_nchw_fp16<float16_t>(input, output, gamma, beta, epsilon, window);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
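
In neon_fp16_instancenorm above, use_mixed_precision selects float accumulators for the FP16 data path because a long reduction in a narrow type accumulates rounding error. A standalone sketch of the effect, using float versus double accumulation as a stand-in for float16_t versus float (illustrative only; not library code):

#include <cstdio>
#include <vector>

// Stand-in demonstration: accumulating many small values in a narrow type
// drifts, while a wider accumulator stays close to the expected result.
// The FP16 kernel applies the same idea with float16_t data and float accumulators.
int main()
{
    const std::vector<float> plane(1 << 20, 0.1f); // one H*W plane of identical values

    float  narrow_sum = 0.f;
    double wide_sum   = 0.0;
    for (float v : plane)
    {
        narrow_sum += v;                      // narrow accumulator: rounding error builds up
        wide_sum   += static_cast<double>(v); // wide accumulator: error stays negligible
    }
    std::printf("narrow: %.1f  wide: %.1f  expected: %.1f\n", narrow_sum, wide_sum, 0.1 * (1 << 20));
    return 0;
}
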
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..e1ca05518d
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_instancenorm(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(use_mixed_precision);
+ return instance_normalization_nchw<float>(input, output, gamma, beta, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
new file mode 100644
index 0000000000..515079e1b5
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+template <typename InputType, typename AccType>
+void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs)
+{
+ result = wrapper::vadd(result, inputs);
+ result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
+}
+
+template <typename InputType, typename AccType>
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+{
+ return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
+}
+
+template <typename T, typename AccType>
+void instance_normalization_nchw(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Clear X/Y dimensions on execution window as we handle the planes manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
+
+ Iterator input_it(input, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<AccType>(0.f);
+ auto sum_squares_h_w = static_cast<AccType>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+ }
+
+ auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = static_cast<AccType>(*(input_ptr + x));
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+ const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ const auto vec_val = wrapper::vloadq(input_ptr + x);
+ const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+ wrapper::vstore(output_ptr + x, normalized_vec);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto val = static_cast<AccType>(*(input_ptr + x));
+ *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
+ }
+ },
+ input_plane_it, output_plane_it);
+ },
+ input_it);
+}
+
+template void instance_normalization_nchw<float>(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h
new file mode 100644
index 0000000000..e1cc7487f7
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
+#define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
+#include "arm_compute/core/Helpers.h"
+
+#include "arm_neon.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T, typename AccType = T>
+void instance_normalization_nchw(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+
+template <typename InputType, typename AccType = InputType>
+void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs);
+
+template <typename InputType, typename AccType = InputType>
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h
new file mode 100644
index 0000000000..51b496c41d
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/list.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H
+#define SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_INSTANCENORM_KERNEL(func_name) \
+ void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \
+ const Window &window)
+DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm);
+DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm);
+#undef DECLARE_INSTANCENORM_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
new file mode 100644
index 0000000000..296fe88791
--- /dev/null
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "src/core/utils/AssemblyUtils.h"
+
+#include "depthwise_common.hpp"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+constexpr unsigned int idx_width = 1;
+constexpr unsigned int idx_height = 2;
+constexpr unsigned int idx_channels = 0;
+constexpr unsigned int idx_batches = 3;
+
+template <typename TSrc, typename TWeights, typename TDst>
+void create_arm_dwc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info,
+ std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
+ std::string &_name)
+{
+ unsigned int stride_cols{};
+ unsigned int stride_rows{};
+ std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
+
+ unsigned int dilation_cols = info.dilation.x();
+ unsigned int dilation_rows = info.dilation.y();
+
+ const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
+
+ const unsigned int n_batches = src->dimension(idx_batches);
+ const unsigned int src_rows = src->dimension(idx_height);
+ const unsigned int src_cols = src->dimension(idx_width);
+ const unsigned int n_channels = src->dimension(idx_channels);
+ const unsigned int dst_rows = dst->dimension(idx_height);
+ const unsigned int dst_cols = dst->dimension(idx_width);
+
+ const unsigned int kernel_cols = weights->dimension(idx_width);
+ const unsigned int kernel_rows = weights->dimension(idx_height);
+
+ const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
+
+ arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+ dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+ dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
+
+ // Configure assembly pooling kernel
+ auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
+ if (dwc_kernel_asm == nullptr)
+ {
+ // Configuration not supported: Leave function unconfigured:
+ return;
+ }
+
+ _name = dwc_kernel_asm->name();
+ kernel = std::move(dwc_kernel_asm);
+}
+
+template <typename TSrc, typename TWeights, typename TDst>
+void create_arm_dwc_quant(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info,
+ std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
+ std::vector<int32_t> &multipliers,
+ std::vector<int32_t> &right_shifts,
+ std::vector<int32_t> &left_shifts,
+ std::string &_name)
+{
+ unsigned int stride_cols{};
+ unsigned int stride_rows{};
+ std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
+
+ unsigned int dilation_cols = info.dilation.x();
+ unsigned int dilation_rows = info.dilation.y();
+
+ const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
+
+ const unsigned int n_batches = src->dimension(idx_batches);
+ const unsigned int src_rows = src->dimension(idx_height);
+ const unsigned int src_cols = src->dimension(idx_width);
+ const unsigned int n_channels = src->dimension(idx_channels);
+ const unsigned int dst_rows = dst->dimension(idx_height);
+ const unsigned int dst_cols = dst->dimension(idx_width);
+
+ const unsigned int kernel_cols = weights->dimension(idx_width);
+ const unsigned int kernel_rows = weights->dimension(idx_height);
+
+ const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
+
+ arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+ dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+ dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
+
+ const auto src_qinfo = src->quantization_info().uniform();
+ const auto weights_qinfo = weights->quantization_info();
+ const auto dst_qinfo = dst->quantization_info().uniform();
+
+ const unsigned int num_filters = weights_qinfo.scale().size();
+
+ multipliers.resize(num_filters);
+ std::vector<int32_t> dst_shifts(num_filters);
+ quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data());
+
+ // Quantize activation bounds
+ int32_t min_activation = std::numeric_limits<TSrc>::lowest();
+ int32_t max_activation = std::numeric_limits<TSrc>::max();
+ if (info.act_info.enabled())
+ {
+ std::tie(min_activation, max_activation) =
+ get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
+ }
+
+ // Set quantization parameters for assembly kernels
+ arm_gemm::Requantize32 requant_args{};
+ if (is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ left_shifts.resize(num_filters);
+ right_shifts.resize(num_filters);
+ bool need_left_shift = false; // Select more optimized path if left shift is not needed
+ for (unsigned int i = 0; i < num_filters; ++i)
+ {
+ left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
+ right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
+ if (dst_shifts[i] < 0 && !need_left_shift)
+ {
+ need_left_shift = true;
+ }
+ }
+
+ requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+ dst_qinfo.offset, (need_left_shift) ? left_shifts.data() : nullptr,
+ right_shifts.data(), multipliers.data(),
+ static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
+ }
+ else
+ {
+ requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+ dst_qinfo.offset, -dst_shifts[0], multipliers[0],
+ static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
+ }
+
+ // Configure assembly pooling kernel with requantization
+ auto dwc_kernel_asm =
+ arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
+ if (dwc_kernel_asm == nullptr)
+ {
+ // Configuration not supported: Leave function unconfigured:
+ return;
+ }
+ _name = dwc_kernel_asm->name();
+ kernel = std::move(dwc_kernel_asm);
+}
+} // namespace
+
+CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel()
+ : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name()
+{
+}
+
+CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default;
+
+void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info)
+{
+ ARM_COMPUTE_UNUSED(cpu_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+ // Destination initialization if not yet initialized
+ const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info);
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+ _name = "CpuDepthwiseConv2dAssemblyWrapperKernel";
+ std::string asm_kernel_name("");
+#if defined(__aarch64__)
+ switch (src->data_type())
+ {
+ case DataType::QASYMM8:
+ if (is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ _multipliers, _right_shifts, _left_shifts,
+ asm_kernel_name);
+ }
+ else
+ {
+ create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ _multipliers, _right_shifts, _left_shifts,
+ asm_kernel_name);
+ }
+ break;
+ case DataType::QASYMM8_SIGNED:
+ create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers,
+ _right_shifts, _left_shifts, asm_kernel_name);
+ break;
+#if defined(ENABLE_FP16_KERNELS)
+ case DataType::F16:
+ create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ asm_kernel_name);
+ break;
+#endif // defined(ENABLE_FP16_KERNELS)
+ case DataType::F32:
+ create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
+ break;
+ default:
+ break;
+ }
+#endif // defined(__aarch64__)
+
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+ if (_kernel_asm != nullptr)
+ {
+ _name += "/" + asm_kernel_name;
+ }
+}
+
+Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+#if !defined(__aarch64__)
+ ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
+#endif // !defined(__aarch64__)
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC,
+ "Only NHWC is supported by assembly kernels");
+
+ if (is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ }
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
+
+ if (is_data_type_quantized(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ }
+ }
+
+ if (dst->total_size() > 0)
+ {
+ const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ // Assembly kernels cannot work with padding greater than the kernel.
+ const auto &padding = info.pad_stride_info;
+ const auto &dilation = info.dilation;
+ const auto &wei_shape = weights->tensor_shape();
+
+ const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1);
+ const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
+ padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h);
+
+ return Status{};
+}
+
+void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+ ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
+ ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+ auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+ auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+ auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
+
+ const auto src_shape = src->info()->tensor_shape();
+ const auto dst_shape = dst->info()->tensor_shape();
+ const auto src_padding = src->info()->padding();
+ const auto dst_padding = dst->info()->padding();
+
+ const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
+ const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
+ const size_t ld_src_batch = ld_src_row * src_shape[2];
+ const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
+ const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
+ const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
+
+ _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row,
+ ld_dst_batch, working_space, info.thread_id, info.num_threads);
+}
+
+void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(
+    void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row)
+{
+    _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
+}
+
+size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const
+{
+ return _kernel_asm->get_storage_size();
+}
+
+size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
+{
+ return _kernel_asm->get_working_size(num_threads);
+}
+
+bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const
+{
+ return _kernel_asm != nullptr;
+}
+
+const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const
+{
+ return _name.c_str();
+}
+
+size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
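
In create_arm_dwc_quant above, each per-channel dst_shift is split into a non-negative left shift and a non-positive right shift, and need_left_shift records whether the slower left-shifting path is required at all. A standalone sketch of that split with made-up shift values (illustrative only):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustration of the shift split used for per-channel requantization:
// a negative dst_shift becomes a left shift, a non-negative one a right shift,
// and need_left_shift flags whether any channel needs the former.
int main()
{
    const std::vector<int32_t> dst_shifts = {3, -1, 0, 5}; // assumed example values
    std::vector<int32_t>       left(dst_shifts.size()), right(dst_shifts.size());
    bool                       need_left_shift = false;

    for (std::size_t i = 0; i < dst_shifts.size(); ++i)
    {
        left[i]         = std::max(-dst_shifts[i], 0);
        right[i]        = std::min(-dst_shifts[i], 0);
        need_left_shift = need_left_shift || (dst_shifts[i] < 0);
    }

    for (std::size_t i = 0; i < dst_shifts.size(); ++i)
    {
        std::printf("channel %zu: dst_shift %d -> left %d, right %d\n", i, dst_shifts[i], left[i], right[i]);
    }
    std::printf("need_left_shift = %s\n", need_left_shift ? "true" : "false");
    return 0;
}
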
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
new file mode 100644
index 0000000000..fadaefb999
--- /dev/null
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+// Forward declarations
+class IDepthwiseCommon;
+} // namespace depthwise
+} // namespace arm_conv
+
+namespace arm_compute
+{
+struct ConvolutionInfo;
+
+namespace cpu
+{
+namespace kernels
+{
+/** This class is a wrapper for the depthwise convolution assembly kernels. */
+class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel<CpuDepthwiseConv2dAssemblyWrapperKernel>
+{
+public:
+ /** Default constructor */
+ CpuDepthwiseConv2dAssemblyWrapperKernel();
+ ~CpuDepthwiseConv2dAssemblyWrapperKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyWrapperKernel);
+
+ /** Initialise the kernel's src and dst.
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias Bias tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] dst      Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution layer meta-data.
+ * @param[in] cpu_info CPU information needed to select the most appropriate kernel.
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info);
+
+ /** Indicates whether or not this function can be used to process the given parameters.
+ *
+ * Similar to @ref CpuDepthwiseConv2dAssemblyWrapperKernel::configure()
+ *
+ * @return a status.
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ /** Pack bias and weights in a storage space for the assembly kernel
+ *
+ * @param[in] parameters_ptr Pointer to storage space.
+ * @param[in] bias_ptr Pointer to bias buffer.
+ * @param[in] weights_ptr Pointer to weights buffer.
+ * @param[in] ld_weights_col Columns displacement for the weights tensor.
+ * @param[in] ld_weights_row Rows displacement for the weights tensor.
+ */
+ void pack_parameters(
+ void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
+
+ /** Get the amount of storage space required for the rearranged weights and bias.
+ *
+ * @return size of workspace
+ */
+ size_t get_storage_size() const;
+
+ /** Get size of the workspace needed by the assembly kernel.
+ *
+ * @param[in] num_threads Maximum number of threads that are going to be spawned.
+ *
+ * @return size of workspace
+ */
+ size_t get_working_size(unsigned int num_threads) const;
+
+ /** Was the asm kernel successfully configured?
+ *
+ * @return True if the asm kernel is configured and ready to run
+ */
+ bool is_configured() const;
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+     * @return[out] small_network_mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+private:
+ std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> _kernel_asm;
+ std::vector<int32_t> _multipliers{};
+ std::vector<int32_t> _left_shifts{};
+ std::vector<int32_t> _right_shifts{};
+ std::string _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H */
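
The class above is normally driven through validate()/configure()/pack_parameters()/run_op(). The sketch below exercises only the static validate() declared in this header; the include path for ConvolutionInfo, the tensor shapes, the channel-first shape ordering, and the convolution parameters are assumptions for illustration, not taken from this patch:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ConvolutionInfo.h" // assumed location of ConvolutionInfo
#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"

using namespace arm_compute;

// Ask the wrapper whether the assembly backend can handle a hypothetical
// 3x3, stride-1 depthwise convolution on a 56x56, 32-channel NHWC FP32 tensor.
bool assembly_depthwise_supported()
{
    TensorInfo src(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32); // assumed channel-first NHWC shape ordering
    TensorInfo weights(TensorShape(32U, 3U, 3U), 1, DataType::F32);   // hypothetical 3x3 kernel over 32 channels
    TensorInfo bias(TensorShape(32U), 1, DataType::F32);
    TensorInfo dst; // left empty: validate() only checks dst when it is already configured
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);

    ConvolutionInfo conv_info{};
    conv_info.pad_stride_info  = PadStrideInfo(1, 1, 1, 1); // stride 1, pad 1
    conv_info.depth_multiplier = 1;
    conv_info.dilation         = Size2D(1, 1);

    const Status status =
        cpu::kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(&src, &weights, &bias, &dst, conv_info);
    return status.error_code() == ErrorCode::OK;
}
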
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
new file mode 100644
index 0000000000..2c1cb15786
--- /dev/null
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
+{
+ ARM_COMPUTE_UNUSED(cpu_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Initialize dst if it has not been initialized yet
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
+
+#if defined(__aarch64__)
+ const bool requantize = src->quantization_info() != dst->quantization_info();
+
+ switch (src->data_type())
+ {
+ case DataType::QASYMM8:
+ if (requantize)
+ {
+ create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
+ }
+ else
+ {
+ create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
+ }
+ break;
+ case DataType::QASYMM8_SIGNED:
+ if (requantize)
+ {
+ create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
+ }
+ else
+ {
+ create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
+ }
+ break;
+#if defined(ENABLE_FP16_KERNELS)
+ case DataType::F16:
+ create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
+ break;
+#endif // defined(ENABLE_FP16_KERNELS)
+ case DataType::F32:
+ create_arm_pooling<float, float>(src, dst, info, cpu_info);
+ break;
+ default:
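+ // Unsupported data type: _kernel_asm is left empty, so callers must check is_configured().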
+ break;
+ }
+#endif // defined(__aarch64__)
+
+ Window win = calculate_max_window(*dst, Steps());
+ INEKernel::configure(win);
+}
+
+Status
+CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+#ifndef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
+#endif /* __aarch64__ */
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC),
+ "Only NHWC is supported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
+ "Only AVG and MAX pooling are supported by assembly kernels");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_pool_region_entirely_outside_input(info),
+ "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
+
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+ const TensorInfo out_info(compute_pool_shape(*src, info), 1, dst->data_type());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+ const auto src_qinfo = src->quantization_info().uniform();
+ const auto dst_qinfo = dst->quantization_info().uniform();
+
+ if (src_qinfo != dst_qinfo)
+ {
+ const float multiplier = src_qinfo.scale / dst_qinfo.scale;
+ int32_t dst_multiplier{};
+ int32_t dst_shift{};
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
+ }
+ else
+ {
+ if (src->data_type() == DataType::QASYMM8)
+ {
+ const bool has_padding = info.pad_stride_info.has_padding();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !info.exclude_padding && has_padding,
+ "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ }
+ }
+ }
+ else
+ {
+ if (src->data_type() == DataType::QASYMM8)
+ {
+ // If dst is not configured, the quantization info are the same
+ const bool has_padding = info.pad_stride_info.has_padding();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !info.exclude_padding && has_padding,
+ "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ }
+ }
+ return Status{};
+}
+
+void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
+ ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
+
+ const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+ auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+ auto working_space =
+ (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+
+ const auto src_shape = src->info()->tensor_shape();
+ const auto dst_shape = dst->info()->tensor_shape();
+ const auto src_padding = src->info()->padding();
+ const auto dst_padding = dst->info()->padding();
+
+ const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
+ const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
+ const size_t ld_src_batch = ld_src_row * src_shape[2];
+ const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
+ const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
+ const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
+
+ _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
+ working_space, info.thread_id, info.num_threads);
+}
+
+size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
+{
+ return _kernel_asm->get_working_size(num_threads);
+}
+
+bool CpuPool2dAssemblyWrapperKernel::is_configured() const
+{
+ return _kernel_asm != nullptr;
+}
+
+template <typename Typesrc, typename Typedst>
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
+{
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+ ? arm_conv::pooling::PoolingType::AVERAGE
+ : arm_conv::pooling::PoolingType::MAX;
+
+ arm_conv::pooling::PoolingWindow window{};
+ window.cols = static_cast<unsigned int>(info.pool_size.x());
+ window.rows = static_cast<unsigned int>(info.pool_size.y());
+
+ arm_conv::pooling::PoolingStride stride{};
+ std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
+
+ const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+ info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
+
+ constexpr unsigned int idx_width = 1;
+ constexpr unsigned int idx_height = 2;
+ constexpr unsigned int idx_channels = 0;
+ constexpr unsigned int idx_batches = 3;
+
+ const unsigned int n_batches = src->dimension(idx_batches);
+ const unsigned int src_rows = src->dimension(idx_height);
+ const unsigned int src_cols = src->dimension(idx_width);
+ const unsigned int n_channels = src->dimension(idx_channels);
+ const unsigned int dst_rows = dst->dimension(idx_height);
+ const unsigned int dst_cols = dst->dimension(idx_width);
+
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+ src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+
+ // Configure assembly pooling kernel
+ auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
+ if (pooling_kernel_asm == nullptr)
+ {
+ // Configuration not supported: Leave function unconfigured:
+ return;
+ }
+
+ _kernel_asm = std::move(pooling_kernel_asm);
+}
+
+template <typename Typesrc, typename Typedst>
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
+{
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+ ? arm_conv::pooling::PoolingType::AVERAGE
+ : arm_conv::pooling::PoolingType::MAX;
+
+ arm_conv::pooling::PoolingWindow window{};
+ window.cols = static_cast<unsigned int>(info.pool_size.x());
+ window.rows = static_cast<unsigned int>(info.pool_size.y());
+
+ arm_conv::pooling::PoolingStride stride{};
+ std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
+
+ const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+ info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
+
+ constexpr unsigned int idx_width = 1;
+ constexpr unsigned int idx_height = 2;
+ constexpr unsigned int idx_channels = 0;
+ constexpr unsigned int idx_batches = 3;
+
+ const unsigned int n_batches = src->dimension(idx_batches);
+ const unsigned int src_rows = src->dimension(idx_height);
+ const unsigned int src_cols = src->dimension(idx_width);
+ const unsigned int n_channels = src->dimension(idx_channels);
+ const unsigned int dst_rows = dst->dimension(idx_height);
+ const unsigned int dst_cols = dst->dimension(idx_width);
+
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+ src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+
+ const auto src_qinfo = src->quantization_info().uniform();
+ const auto dst_qinfo = dst->quantization_info().uniform();
+
+ const float multiplier = src_qinfo.scale / dst_qinfo.scale;
+ int32_t dst_multiplier{};
+ int32_t dst_shift{};
+ quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
+
+ const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset,
+ dst_shift, // left shift
+ 0, // right shift
+ dst_multiplier);
+
+ // Configure assembly pooling kernel with requantization
+ auto pooling_kernel_asm =
+ arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
+ if (pooling_kernel_asm == nullptr)
+ {
+ // Configuration not supported: Leave function unconfigured:
+ return;
+ }
+
+ _kernel_asm = std::move(pooling_kernel_asm);
+}
+
+size_t CpuPool2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
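For reference, the leading dimensions handed to _kernel_asm->execute() in run_op() above depend only on the padded NHWC shape. A standalone restatement of that arithmetic (a hypothetical helper, not part of the patch; names are illustrative):

    #include <cstddef>

    struct LeadingDims
    {
        std::size_t col;   // elements between consecutive columns  = padded channel count
        std::size_t row;   // elements between consecutive rows     = col * padded width
        std::size_t batch; // elements between consecutive batches  = row * height
    };

    // Mirrors the ld_src_*/ld_dst_* computation in run_op() for an NHWC tensor whose
    // shape is {channels, width, height, batches} and whose X/Y element padding is given.
    inline LeadingDims nhwc_leading_dims(std::size_t channels, std::size_t width, std::size_t height,
                                         std::size_t pad_left, std::size_t pad_right,
                                         std::size_t pad_top, std::size_t pad_bottom)
    {
        LeadingDims ld{};
        ld.col   = channels + pad_left + pad_right;
        ld.row   = ld.col * (width + pad_top + pad_bottom);
        ld.batch = ld.row * height;
        return ld;
    }

The same formulas are applied to both src and dst, each with its own padding.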
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
new file mode 100644
index 0000000000..b4ff1e6f2d
--- /dev/null
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
+
+#include "pool_common.hpp"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** This class is a wrapper for the assembly kernels.
+ *
+ * Some kernels are written in assembly and highly optimised for specific
+ * CPUs such as A53 or A55. The Arm Compute Library creates an instance of
+ * CpuPool2dAssemblyWrapperKernel, together with the auxiliary data structures
+ * it needs, to execute a single assembly kernel in the context of an NEFunction.
+ *
+ */
+class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel<CpuPool2dAssemblyWrapperKernel>
+{
+public:
+ /** Constructor
+ */
+ CpuPool2dAssemblyWrapperKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel);
+
+ const char *name() const override
+ {
+ return "CpuPool2dAssemblyWrapperKernel";
+ }
+
+ /** Initialise the kernel's src and dst.
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src.
+ * @param[in] info Pooling meta-data.
+ * @param[in] cpu_info CPU information needed to select the most appropriate kernel.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPool2dAssemblyWrapperKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+ /** Get size of the workspace needed by the assembly kernel.
+ *
+ * @param[in] num_threads Maximum number of threads that are going to be spawned.
+ *
+ * @return size of workspace
+ */
+ size_t get_working_size(unsigned int num_threads) const;
+
+ /** Was the asm kernel successfully configured?
+ *
+ * @return True if the asm kernel is configured and ready to run
+ */
+ bool is_configured() const;
+
+private:
+ /** Helper function to create the assembly kernel.
+ *
+ * @param[in] src Source tensor info.
+ * @param[in] dst Destination tensor info.
+ * @param[in] info Pooling layer meta-data.
+ */
+ template <typename Typesrc, typename Typedst>
+ void
+ create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+
+ /** Helper function to create the assembly kernel with requantization support
+ *
+ * @param[in] src Source tensor info.
+ * @param[in] dst Destination tensor info.
+ * @param[in] info Pooling layer meta-data.
+ */
+ template <typename Typesrc, typename Typedst>
+ void create_arm_pooling_requant(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info);
+
+ std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{nullptr};
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] small_network_mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */
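A hedged usage sketch (not part of the patch) of the selection flow this header implies: validate() performs the static checks, configure() may still decline silently, and is_configured() is the final arbiter before the kernel can be scheduled. The helper name try_make_asm_pool2d is illustrative, and converting the returned Status to bool is assumed to follow the library's usual validate-check idiom.

    #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"

    #include <memory>

    std::unique_ptr<arm_compute::cpu::kernels::CpuPool2dAssemblyWrapperKernel>
    try_make_asm_pool2d(const arm_compute::ITensorInfo      *src,
                        arm_compute::ITensorInfo            *dst,
                        const arm_compute::PoolingLayerInfo &info,
                        const arm_compute::CPUInfo          &cpu_info)
    {
        using Kernel = arm_compute::cpu::kernels::CpuPool2dAssemblyWrapperKernel;
        if (!bool(Kernel::validate(src, dst, info)))
        {
            return nullptr; // Static checks rejected the configuration.
        }
        auto kernel = std::make_unique<Kernel>();
        kernel->configure(src, dst, info, cpu_info);
        // configure() leaves the wrapper empty when no matching assembly kernel exists,
        // so the caller must fall back to a non-assembly pooling kernel in that case.
        return kernel->is_configured() ? std::move(kernel) : nullptr;
    }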
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..6c6527de06
--- /dev/null
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+{
+ ARM_COMPUTE_UNUSED(unused_axis);
+ return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window);
+}
+
+void neon_fp16_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+{
+ return l2_normalize_yz<float16_t, 8>(in, sum, out, epsilon, window, axis);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..520877068c
--- /dev/null
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+{
+ ARM_COMPUTE_UNUSED(unused_axis);
+ return l2_normalize_x<float, 4>(in, sum, out, epsilon, window);
+}
+
+void neon_fp32_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+{
+ return l2_normalize_yz<float, 4>(in, sum, out, epsilon, window, axis);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
new file mode 100644
index 0000000000..6bd19299b7
--- /dev/null
+++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CPU_KERNELS_L2NORMLAYER_GENERIC_NEON_IMPL_H
+#define SRC_CPU_KERNELS_L2NORMLAYER_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T, int S>
+void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ const int window_step_x = 16 / data_size_from_type(in->info()->data_type());
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input_it(in, win_collapsed);
+ Iterator sum_it(sum, win_collapsed);
+ Iterator output_it(out, win_collapsed);
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
+ const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
+}
+
+template <typename T, int S>
+void l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+{
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ const int window_step_x = 16 / data_size_from_type(in->info()->data_type());
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window window_sum(win);
+ window_sum.set(axis, Window::Dimension(0, 0, 0));
+
+ Iterator input_it(in, win);
+ Iterator sum_it(sum, window_sum);
+ Iterator output_it(out, win);
+
+ const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CPU_KERNELS_L2NORMLAYER_GENERIC_NEON_IMPL_H
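Both templates implement the same per-element formula; a scalar reference (a minimal sketch, independent of the NEON wrappers) may make the left-over loops above easier to follow. Here sum_sq holds the precomputed sum of squares along the normalisation axis: for the X variant a single value is shared by the whole row, for the Y/Z variants it varies per element.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // out[i] = in[i] / sqrt(max(sum_sq[i], epsilon))
    inline void l2_normalize_reference(const float *in, const float *sum_sq, float *out,
                                       std::size_t n, float epsilon)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            out[i] = in[i] / std::sqrt(std::max(sum_sq[i], epsilon));
        }
    }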
diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h
new file mode 100644
index 0000000000..e2a879d06e
--- /dev/null
+++ b/src/cpu/kernels/l2normlayer/list.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H
+#define SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
+ void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \
+ size_t axis)
+
+DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x);
+DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz);
+DECLARE_L2NORMLAYER_KERNEL(neon_fp32_l2_normalize_x);
+DECLARE_L2NORMLAYER_KERNEL(neon_fp32_l2_normalize_yz);
+
+#undef DECLARE_L2NORMLAYER_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H
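Expanded, each DECLARE_L2NORMLAYER_KERNEL(...) line above is an ordinary free-function declaration; for example, the first one becomes the following (the *_x variants ignore the trailing axis argument, as the fp16.cpp/fp32.cpp implementations above show):

    void neon_fp16_l2_normalize_x(
        const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis);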
diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp
new file mode 100644
index 0000000000..5516f5b33d
--- /dev/null
+++ b/src/cpu/kernels/lut/generic/neon/u8.cpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/lut/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#ifdef __aarch64__
+
+void lut_u8_neon(
+ const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
+{
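+ // The 256-entry table stays resident in q16-q31 for the whole call. Each 48-byte block of a
+ // string is translated with four TBL rounds: the raw indices select from {v16-v19} (entries
+ // 0-63), while copies rebased by 0x40/0x80/0xC0 (v11/v10/v9) select from {v20-v23}, {v24-v27}
+ // and {v28-v31}. Out-of-range lanes yield zero, so the four partial results are ORed together.
+ // The numbered branches handle partial loads/stores for string tails shorter than 48 bytes.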
+ __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n"
+ "ldr q17, [%x[table], #0x10]\n"
+ "mov x23, #0x0\n"
+ "ldr q18, [%x[table], #0x20]\n"
+ "ldr q19, [%x[table], #0x30]\n"
+ "ldr q20, [%x[table], #0x40]\n"
+ "ldr q21, [%x[table], #0x50]\n"
+ "ldr q22, [%x[table], #0x60]\n"
+ "ldr q23, [%x[table], #0x70]\n"
+ "ldr q24, [%x[table], #0x80]\n"
+ "ldr q25, [%x[table], #0x90]\n"
+ "ldr q26, [%x[table], #0xa0]\n"
+ "ldr q27, [%x[table], #0xb0]\n"
+ "ldr q28, [%x[table], #0xc0]\n"
+ "ldr q29, [%x[table], #0xd0]\n"
+ "ldr q30, [%x[table], #0xe0]\n"
+ "ldr q31, [%x[table], #0xf0]\n"
+ "1:" // string loop
+ "ldr x22, [%x[input], x23, LSL #0x3]\n"
+ "ldr x21, [%x[output], x23, LSL #0x3]\n"
+ "movi v11.16b, #0x40\n"
+ "movi v10.16b, #0x80\n"
+ "movi v9.16b, #0xc0\n"
+ "mov x20, %x[string_length]\n"
+ "2:" // 4 rounds: width loop
+ "cmp x20, #0x30\n"
+ "bge 27f\n"
+ "tbz x20, #5, 10f\n"
+ "ld1 { v8.16b }, [x22], #0x10\n"
+ "ld1 { v13.16b }, [x22], #0x10\n"
+ "tbz x20, #3, 6f\n"
+ "ldr d12, [x22], #0x8\n"
+ "tbz x20, #2, 4f\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 3f\n"
+ "ld1 { v12.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[14], [x22]\n"
+ "b 26f\n"
+ "3:" // 4 rounds: Partial load: partial_1_44
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[12], [x22]\n"
+ "b 26f\n"
+ "4:" // 4 rounds: Partial load: partial_2_40
+ "tbz x20, #1, 5f\n"
+ "ld1 { v12.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[10], [x22]\n"
+ "b 26f\n"
+ "5:" // 4 rounds: Partial load: partial_1_40
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[8], [x22]\n"
+ "b 26f\n"
+ "6:" // 4 rounds: Partial load: partial_4_32
+ "tbz x20, #2, 8f\n"
+ "ldr s12, [x22], #0x4\n"
+ "tbz x20, #1, 7f\n"
+ "ld1 { v12.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[6], [x22]\n"
+ "b 26f\n"
+ "7:" // 4 rounds: Partial load: partial_1_36
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[4], [x22]\n"
+ "b 26f\n"
+ "8:" // 4 rounds: Partial load: partial_2_32
+ "tbz x20, #1, 9f\n"
+ "ldr h12, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[2], [x22]\n"
+ "b 26f\n"
+ "9:" // 4 rounds: Partial load: partial_1_32
+ "tbz x20, #0, 26f\n"
+ "ldr b12, [x22, #0x0]\n"
+ "b 26f\n"
+ "10:" // 4 rounds: Partial load: partial_16_0
+ "tbz x20, #4, 18f\n"
+ "ld1 { v8.16b }, [x22], #0x10\n"
+ "tbz x20, #3, 14f\n"
+ "ldr d13, [x22], #0x8\n"
+ "tbz x20, #2, 12f\n"
+ "ld1 { v13.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 11f\n"
+ "ld1 { v13.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[14], [x22]\n"
+ "b 26f\n"
+ "11:" // 4 rounds: Partial load: partial_1_28
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[12], [x22]\n"
+ "b 26f\n"
+ "12:" // 4 rounds: Partial load: partial_2_24
+ "tbz x20, #1, 13f\n"
+ "ld1 { v13.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[10], [x22]\n"
+ "b 26f\n"
+ "13:" // 4 rounds: Partial load: partial_1_24
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[8], [x22]\n"
+ "b 26f\n"
+ "14:" // 4 rounds: Partial load: partial_4_16
+ "tbz x20, #2, 16f\n"
+ "ldr s13, [x22], #0x4\n"
+ "tbz x20, #1, 15f\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[6], [x22]\n"
+ "b 26f\n"
+ "15:" // 4 rounds: Partial load: partial_1_20
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[4], [x22]\n"
+ "b 26f\n"
+ "16:" // 4 rounds: Partial load: partial_2_16
+ "tbz x20, #1, 17f\n"
+ "ldr h13, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[2], [x22]\n"
+ "b 26f\n"
+ "17:" // 4 rounds: Partial load: partial_1_16
+ "tbz x20, #0, 26f\n"
+ "ldr b13, [x22, #0x0]\n"
+ "b 26f\n"
+ "18:" // 4 rounds: Partial load: partial_8_0
+ "tbz x20, #3, 22f\n"
+ "ldr d8, [x22], #0x8\n"
+ "tbz x20, #2, 20f\n"
+ "ld1 { v8.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 19f\n"
+ "ld1 { v8.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[14], [x22]\n"
+ "b 26f\n"
+ "19:" // 4 rounds: Partial load: partial_1_12
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[12], [x22]\n"
+ "b 26f\n"
+ "20:" // 4 rounds: Partial load: partial_2_8
+ "tbz x20, #1, 21f\n"
+ "ld1 { v8.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[10], [x22]\n"
+ "b 26f\n"
+ "21:" // 4 rounds: Partial load: partial_1_8
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[8], [x22]\n"
+ "b 26f\n"
+ "22:" // 4 rounds: Partial load: partial_4_0
+ "tbz x20, #2, 24f\n"
+ "ldr s8, [x22], #0x4\n"
+ "tbz x20, #1, 23f\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "b 26f\n"
+ "23:" // 4 rounds: Partial load: partial_1_4
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "b 26f\n"
+ "24:" // 4 rounds: Partial load: partial_2_0
+ "tbz x20, #1, 25f\n"
+ "ldr h8, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "b 26f\n"
+ "25:" // 4 rounds: Partial load: partial_1_0
+ "ldr b8, [x22, #0x0]\n"
+ "26:" // 4 rounds: Partial load: Done
+ "b 28f\n"
+ "27:" // 4 rounds: Full load
+ "ldr q8, [x22, #0x0]\n"
+ "ldr q13, [x22, #0x10]\n"
+ "ldr q12, [x22, #0x20]\n"
+ "add x22, x22, #0x30\n"
+ "28:" // 4 rounds: Load done
+ "sub v0.16b, v8.16b, v11.16b\n"
+ "sub v7.16b, v8.16b, v10.16b\n"
+ "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n"
+ "sub v6.16b, v8.16b, v9.16b\n"
+ "sub v5.16b, v13.16b, v11.16b\n"
+ "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n"
+ "sub v4.16b, v13.16b, v10.16b\n"
+ "sub v3.16b, v13.16b, v9.16b\n"
+ "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
+ "sub v2.16b, v12.16b, v11.16b\n"
+ "sub v1.16b, v12.16b, v10.16b\n"
+ "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
+ "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
+ "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
+ "orr v8.16b, v8.16b, v0.16b\n"
+ "sub v0.16b, v12.16b, v9.16b\n"
+ "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
+ "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
+ "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n"
+ "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
+ "orr v7.16b, v7.16b, v6.16b\n"
+ "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
+ "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
+ "orr v13.16b, v13.16b, v5.16b\n"
+ "orr v4.16b, v4.16b, v3.16b\n"
+ "orr v12.16b, v12.16b, v2.16b\n"
+ "cmp x20, #0x30\n"
+ "orr v1.16b, v1.16b, v0.16b\n"
+ "orr v8.16b, v8.16b, v7.16b\n"
+ "orr v13.16b, v13.16b, v4.16b\n"
+ "orr v12.16b, v12.16b, v1.16b\n"
+ "bge 53f\n"
+ "tbz x20, #5, 36f\n"
+ "st1 { v8.16b }, [x21], #0x10\n"
+ "st1 { v13.16b }, [x21], #0x10\n"
+ "tbz x20, #3, 32f\n"
+ "str d12, [x21], #0x8\n"
+ "tbz x20, #2, 30f\n"
+ "st1 { v12.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v12.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[14], [x21]\n"
+ "b 52f\n"
+ "29:" // 4 rounds: Partial writeback: partial_1_44
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[12], [x21]\n"
+ "b 52f\n"
+ "30:" // 4 rounds: Partial writeback: partial_2_40
+ "tbz x20, #1, 31f\n"
+ "st1 { v12.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[10], [x21]\n"
+ "b 52f\n"
+ "31:" // 4 rounds: Partial writeback: partial_1_40
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[8], [x21]\n"
+ "b 52f\n"
+ "32:" // 4 rounds: Partial writeback: partial_4_32
+ "tbz x20, #2, 34f\n"
+ "str s12, [x21], #0x4\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v12.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[6], [x21]\n"
+ "b 52f\n"
+ "33:" // 4 rounds: Partial writeback: partial_1_36
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[4], [x21]\n"
+ "b 52f\n"
+ "34:" // 4 rounds: Partial writeback: partial_2_32
+ "tbz x20, #1, 35f\n"
+ "str h12, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[2], [x21]\n"
+ "b 52f\n"
+ "35:" // 4 rounds: Partial writeback: partial_1_32
+ "tbz x20, #0, 52f\n"
+ "str b12, [x21, #0x0]\n"
+ "b 52f\n"
+ "36:" // 4 rounds: Partial writeback: partial_16_0
+ "tbz x20, #4, 44f\n"
+ "st1 { v8.16b }, [x21], #0x10\n"
+ "tbz x20, #3, 40f\n"
+ "str d13, [x21], #0x8\n"
+ "tbz x20, #2, 38f\n"
+ "st1 { v13.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 37f\n"
+ "st1 { v13.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[14], [x21]\n"
+ "b 52f\n"
+ "37:" // 4 rounds: Partial writeback: partial_1_28
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[12], [x21]\n"
+ "b 52f\n"
+ "38:" // 4 rounds: Partial writeback: partial_2_24
+ "tbz x20, #1, 39f\n"
+ "st1 { v13.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[10], [x21]\n"
+ "b 52f\n"
+ "39:" // 4 rounds: Partial writeback: partial_1_24
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[8], [x21]\n"
+ "b 52f\n"
+ "40:" // 4 rounds: Partial writeback: partial_4_16
+ "tbz x20, #2, 42f\n"
+ "str s13, [x21], #0x4\n"
+ "tbz x20, #1, 41f\n"
+ "st1 { v13.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[6], [x21]\n"
+ "b 52f\n"
+ "41:" // 4 rounds: Partial writeback: partial_1_20
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[4], [x21]\n"
+ "b 52f\n"
+ "42:" // 4 rounds: Partial writeback: partial_2_16
+ "tbz x20, #1, 43f\n"
+ "str h13, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[2], [x21]\n"
+ "b 52f\n"
+ "43:" // 4 rounds: Partial writeback: partial_1_16
+ "tbz x20, #0, 52f\n"
+ "str b13, [x21, #0x0]\n"
+ "b 52f\n"
+ "44:" // 4 rounds: Partial writeback: partial_8_0
+ "tbz x20, #3, 48f\n"
+ "str d8, [x21], #0x8\n"
+ "tbz x20, #2, 46f\n"
+ "st1 { v8.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 45f\n"
+ "st1 { v8.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[14], [x21]\n"
+ "b 52f\n"
+ "45:" // 4 rounds: Partial writeback: partial_1_12
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[12], [x21]\n"
+ "b 52f\n"
+ "46:" // 4 rounds: Partial writeback: partial_2_8
+ "tbz x20, #1, 47f\n"
+ "st1 { v8.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[10], [x21]\n"
+ "b 52f\n"
+ "47:" // 4 rounds: Partial writeback: partial_1_8
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[8], [x21]\n"
+ "b 52f\n"
+ "48:" // 4 rounds: Partial writeback: partial_4_0
+ "tbz x20, #2, 50f\n"
+ "str s8, [x21], #0x4\n"
+ "tbz x20, #1, 49f\n"
+ "st1 { v8.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[6], [x21]\n"
+ "b 52f\n"
+ "49:" // 4 rounds: Partial writeback: partial_1_4
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[4], [x21]\n"
+ "b 52f\n"
+ "50:" // 4 rounds: Partial writeback: partial_2_0
+ "tbz x20, #1, 51f\n"
+ "str h8, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[2], [x21]\n"
+ "b 52f\n"
+ "51:" // 4 rounds: Partial writeback: partial_1_0
+ "str b8, [x21, #0x0]\n"
+ "52:" // 4 rounds: Partial writeback: Done
+ "b 54f\n"
+ "53:" // 4 rounds: Full writeback
+ "str q8, [x21, #0x0]\n"
+ "str q13, [x21, #0x10]\n"
+ "str q12, [x21, #0x20]\n"
+ "add x21, x21, #0x30\n"
+ "54:" // 4 rounds: Writeback done
+ "subs x20, x20, #0x30\n"
+ "bgt 2b\n"
+ "add x23, x23, #0x1\n"
+ "cmp x23, %x[num_strings]\n"
+ "bne 1b\n"
+ :
+ : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output),
+ [string_length] "r"(string_length), [table] "r"(table)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23");
+}
+
+#endif // __aarch64__
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/lut/generic/sve/u16.cpp b/src/cpu/kernels/lut/generic/sve/u16.cpp
new file mode 100644
index 0000000000..75b8dcaae2
--- /dev/null
+++ b/src/cpu/kernels/lut/generic/sve/u16.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void lut_u16_sve(const uint16_t *table, size_t num_strings, size_t size, const uint16_t *input, uint16_t *output)
+{
+ int64_t cnth = svcnth();
+ int64_t tail = size & (4 * cnth - 1);
+ int64_t count = size - tail;
+ int64_t pos = 0;
+ ARM_COMPUTE_UNUSED(num_strings);
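+ // Vector body: each 16-bit index vector is widened to 32-bit lanes by interleaving with the
+ // zero vector z31 (zip1/zip2), gathered from the table with ld1h ... UXTW #1 (the index is
+ // scaled by sizeof(uint16_t)), and narrowed back with uzp1. Four vectors are processed per
+ // iteration (x7 elements / x8 bytes); any remainder falls through to the scalar tail loop.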
+ __asm __volatile("cbz %[count], 2f\n"
+ "mov z31.s, #0\n"
+ "cnth x7, ALL, MUL #4\n"
+ "cntb x8, ALL, MUL #4\n"
+ "ptrue p0.b\n"
+ "1:"
+ "ld1h z0.h, p0/z, [%[input]]\n"
+ "ld1h z1.h, p0/z, [%[input], #1, MUL VL]\n"
+ "ld1h z2.h, p0/z, [%[input], #2, MUL VL]\n"
+ "ld1h z3.h, p0/z, [%[input], #3, MUL VL]\n"
+ "add %[input], %[input], x8\n"
+
+ "zip1 z8.h, z0.h, z31.h\n"
+ "ld1h z8.s, p0/z, [%[table], z8.s, UXTW #1]\n"
+ "zip2 z0.h, z0.h, z31.h\n"
+ "ld1h z0.s, p0/z, [%[table], z0.s, UXTW #1]\n"
+ "uzp1 z0.h, z8.h, z0.h\n"
+ "st1h z0.h, p0, [%[output]]\n"
+
+ "zip1 z10.h, z1.h, z31.h\n"
+ "ld1h z10.s, p0/z, [%[table], z10.s, UXTW #1]\n"
+ "zip2 z1.h, z1.h, z31.h\n"
+ "ld1h z1.s, p0/z, [%[table], z1.s, UXTW #1]\n"
+ "uzp1 z1.h, z10.h, z1.h\n"
+ "st1h z1.h, p0, [%[output], #1, MUL VL]\n"
+
+ "zip1 z12.h, z2.h, z31.h\n"
+ "ld1h z12.s, p0/z, [%[table], z12.s, UXTW #1]\n"
+ "zip2 z2.h, z2.h, z31.h\n"
+ "ld1h z2.s, p0/z, [%[table], z2.s, UXTW #1]\n"
+ "uzp1 z2.h, z12.h, z2.h\n"
+ "st1h z2.h, p0, [%[output], #2, MUL VL]\n"
+
+ "zip1 z14.h, z3.h, z31.h\n"
+ "ld1h z14.s, p0/z, [%[table], z14.s, UXTW #1]\n"
+ "zip2 z3.h, z3.h, z31.h\n"
+ "ld1h z3.s, p0/z, [%[table], z3.s, UXTW #1]\n"
+ "uzp1 z3.h, z14.h, z3.h\n"
+ "st1h z3.h, p0, [%[output], #3, MUL VL]\n"
+
+ "add %[pos], %[pos], x7\n"
+ "add %[output], %[output], x8\n"
+ "cmp %[pos], %[count]\n"
+ "blt 1b\n"
+ "2:\n"
+ : [count] "+r"(count), [input] "+r"(input), [output] "+r"(output), [pos] "+r"(pos)
+ : [table] "r"(table)
+ : "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12",
+ "z14", "z31", "p0", "p1", "z2", "z3", "z4", "x7", "x8");
+ for (int i = 0; i < tail; i++)
+ {
+ output[i] = table[input[i]];
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __aarch64__
diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp
new file mode 100644
index 0000000000..ee8572703e
--- /dev/null
+++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/lut/list.h"
+
+#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+namespace arm_compute
+{
+namespace cpu
+{
+void lut_u8_sve2(
+ const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
+{
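+ // The 256-entry table is preloaded into as many of z16-z31 as the vector length requires:
+ // the tbnz checks on cntd stop after 1, 2, 4, 8 or 16 registers for 2048-, 1024-, 512-, 256-
+ // and 128-bit vectors respectively. Each block of indices is then translated with a TBL on
+ // z16 followed by a chain of TBX instructions, rebasing the indices by the vector length in
+ // bytes (z12) before every step so that each table register covers the next slice of entries.
+ // TBX only writes lanes whose rebased index is in range, so the partial results merge in
+ // place; whilelt predicates cover the tail of each string.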
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "cntd x25\n"
+ "addvl %x[table], %x[table], #8\n"
+ "ld1b { z16.b }, p0/Z, [%x[table], #-8, MUL VL]\n"
+ "tbnz x25, #5, 1f\n"
+ "ld1b { z17.b }, p0/Z, [%x[table], #-7, MUL VL]\n"
+ "tbnz x25, #4, 1f\n"
+ "ld1b { z18.b }, p0/Z, [%x[table], #-6, MUL VL]\n"
+ "ld1b { z19.b }, p0/Z, [%x[table], #-5, MUL VL]\n"
+ "tbnz x25, #3, 1f\n"
+ "ld1b { z20.b }, p0/Z, [%x[table], #-4, MUL VL]\n"
+ "ld1b { z21.b }, p0/Z, [%x[table], #-3, MUL VL]\n"
+ "ld1b { z22.b }, p0/Z, [%x[table], #-2, MUL VL]\n"
+ "ld1b { z23.b }, p0/Z, [%x[table], #-1, MUL VL]\n"
+ "tbnz x25, #2, 1f\n"
+ "ld1b { z24.b }, p0/Z, [%x[table]]\n"
+ "ld1b { z25.b }, p0/Z, [%x[table], #1, MUL VL]\n"
+ "ld1b { z26.b }, p0/Z, [%x[table], #2, MUL VL]\n"
+ "ld1b { z27.b }, p0/Z, [%x[table], #3, MUL VL]\n"
+ "ld1b { z28.b }, p0/Z, [%x[table], #4, MUL VL]\n"
+ "ld1b { z29.b }, p0/Z, [%x[table], #5, MUL VL]\n"
+ "ld1b { z30.b }, p0/Z, [%x[table], #6, MUL VL]\n"
+ "ld1b { z31.b }, p0/Z, [%x[table], #7, MUL VL]\n"
+ "1:" // Table load done
+ "mov x24, #0x0\n"
+ "2:" // string loop
+ "ldr x23, [%x[input], x24, LSL #0x3]\n"
+ "ldr x22, [%x[output], x24, LSL #0x3]\n"
+ "tbnz x25, #5, 14f\n"
+ "tbnz x25, #4, 11f\n"
+ "tbnz x25, #3, 8f\n"
+ "tbnz x25, #2, 5f\n"
+ "mov z12.b, #0x10\n"
+ "mov x21, %x[string_length]\n"
+ "ptrue p5.b\n"
+ "ptrue p4.b\n"
+ "ptrue p3.b\n"
+ "ptrue p2.b\n"
+ "ptrue p1.b\n"
+ "ptrue p0.b\n"
+ "3:" // 16 rounds: width loop
+ "addvl x20, x21, #-6\n"
+ "cmp x20, XZR\n"
+ "bge 4f\n"
+ "mov x20, #0x0\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p5.b, XZR, x21\n"
+ "whilelt p4.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p3.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p2.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p1.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p0.b, x20, x21\n"
+ "4:" // 16 rounds: predicate OK
+ "ld1b { z11.b }, p5/Z, [x23]\n"
+ "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n"
+ "tbl z9.b, { z16.b }, z11.b\n"
+ "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n"
+ "sub z11.b, z11.b, z12.b\n"
+ "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n"
+ "tbl z4.b, { z16.b }, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ "tbl z3.b, { z16.b }, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ "tbl z2.b, { z16.b }, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ "tbl z1.b, { z16.b }, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ "tbl z0.b, { z16.b }, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e49 // tbx z9.b, z18.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e44 // tbx z4.b, z18.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e43 // tbx z3.b, z18.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e42 // tbx z2.b, z18.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e41 // tbx z1.b, z18.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e40 // tbx z0.b, z18.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e69 // tbx z9.b, z19.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e64 // tbx z4.b, z19.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e63 // tbx z3.b, z19.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e62 // tbx z2.b, z19.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e61 // tbx z1.b, z19.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e60 // tbx z0.b, z19.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e89 // tbx z9.b, z20.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e84 // tbx z4.b, z20.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e83 // tbx z3.b, z20.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e82 // tbx z2.b, z20.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e81 // tbx z1.b, z20.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e80 // tbx z0.b, z20.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2ea9 // tbx z9.b, z21.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2ea4 // tbx z4.b, z21.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282ea3 // tbx z3.b, z21.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272ea2 // tbx z2.b, z21.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262ea1 // tbx z1.b, z21.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252ea0 // tbx z0.b, z21.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2ec9 // tbx z9.b, z22.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2ec4 // tbx z4.b, z22.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282ec3 // tbx z3.b, z22.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272ec2 // tbx z2.b, z22.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262ec1 // tbx z1.b, z22.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252ec0 // tbx z0.b, z22.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2ee9 // tbx z9.b, z23.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2ee4 // tbx z4.b, z23.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282ee3 // tbx z3.b, z23.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272ee2 // tbx z2.b, z23.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262ee1 // tbx z1.b, z23.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252ee0 // tbx z0.b, z23.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2f09 // tbx z9.b, z24.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2f04 // tbx z4.b, z24.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282f03 // tbx z3.b, z24.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272f02 // tbx z2.b, z24.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262f01 // tbx z1.b, z24.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252f00 // tbx z0.b, z24.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2f29 // tbx z9.b, z25.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2f24 // tbx z4.b, z25.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282f23 // tbx z3.b, z25.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272f22 // tbx z2.b, z25.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262f21 // tbx z1.b, z25.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252f20 // tbx z0.b, z25.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2f49 // tbx z9.b, z26.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2f44 // tbx z4.b, z26.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282f43 // tbx z3.b, z26.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272f42 // tbx z2.b, z26.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262f41 // tbx z1.b, z26.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252f40 // tbx z0.b, z26.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2f69 // tbx z9.b, z27.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2f64 // tbx z4.b, z27.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282f63 // tbx z3.b, z27.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272f62 // tbx z2.b, z27.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262f61 // tbx z1.b, z27.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252f60 // tbx z0.b, z27.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2f89 // tbx z9.b, z28.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2f84 // tbx z4.b, z28.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282f83 // tbx z3.b, z28.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272f82 // tbx z2.b, z28.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262f81 // tbx z1.b, z28.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252f80 // tbx z0.b, z28.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2fa9 // tbx z9.b, z29.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2fa4 // tbx z4.b, z29.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282fa3 // tbx z3.b, z29.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272fa2 // tbx z2.b, z29.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262fa1 // tbx z1.b, z29.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252fa0 // tbx z0.b, z29.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ "addvl x21, x21, #-6\n"
+ ".inst 0x052b2fc9 // tbx z9.b, z30.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2fc4 // tbx z4.b, z30.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282fc3 // tbx z3.b, z30.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272fc2 // tbx z2.b, z30.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262fc1 // tbx z1.b, z30.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252fc0 // tbx z0.b, z30.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ "cmp x21, XZR\n"
+ ".inst 0x052b2fe9 // tbx z9.b, z31.b, z11.b\n"
+ ".inst 0x052a2fe4 // tbx z4.b, z31.b, z10.b\n"
+ ".inst 0x05282fe3 // tbx z3.b, z31.b, z8.b\n"
+ "st1b { z9.b }, p5, [x22]\n"
+ ".inst 0x05272fe2 // tbx z2.b, z31.b, z7.b\n"
+ ".inst 0x05262fe1 // tbx z1.b, z31.b, z6.b\n"
+ "st1b { z4.b }, p4, [x22, #1, MUL VL]\n"
+ ".inst 0x05252fe0 // tbx z0.b, z31.b, z5.b\n"
+ "st1b { z3.b }, p3, [x22, #2, MUL VL]\n"
+ "addvl x23, x23, #6\n"
+ "st1b { z2.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z1.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z0.b }, p0, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "bgt 3b\n"
+ "b 17f\n"
+ "5:" // 256 bits
+ "mov z12.b, #0x20\n"
+ "mov x21, %x[string_length]\n"
+ "ptrue p5.b\n"
+ "ptrue p4.b\n"
+ "ptrue p3.b\n"
+ "ptrue p2.b\n"
+ "ptrue p1.b\n"
+ "ptrue p0.b\n"
+ "6:" // 8 rounds: width loop
+ "addvl x20, x21, #-6\n"
+ "cmp x20, XZR\n"
+ "bge 7f\n"
+ "mov x20, #0x0\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p5.b, XZR, x21\n"
+ "whilelt p4.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p3.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p2.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p1.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p0.b, x20, x21\n"
+ "7:" // 8 rounds: predicate OK
+ "ld1b { z11.b }, p5/Z, [x23]\n"
+ "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n"
+ "tbl z9.b, { z16.b }, z11.b\n"
+ "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n"
+ "sub z11.b, z11.b, z12.b\n"
+ "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n"
+ "tbl z4.b, { z16.b }, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ "tbl z3.b, { z16.b }, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ "tbl z2.b, { z16.b }, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ "tbl z1.b, { z16.b }, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ "tbl z0.b, { z16.b }, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e49 // tbx z9.b, z18.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e44 // tbx z4.b, z18.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e43 // tbx z3.b, z18.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e42 // tbx z2.b, z18.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e41 // tbx z1.b, z18.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e40 // tbx z0.b, z18.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e69 // tbx z9.b, z19.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e64 // tbx z4.b, z19.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e63 // tbx z3.b, z19.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e62 // tbx z2.b, z19.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e61 // tbx z1.b, z19.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e60 // tbx z0.b, z19.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e89 // tbx z9.b, z20.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e84 // tbx z4.b, z20.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e83 // tbx z3.b, z20.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e82 // tbx z2.b, z20.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e81 // tbx z1.b, z20.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e80 // tbx z0.b, z20.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2ea9 // tbx z9.b, z21.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2ea4 // tbx z4.b, z21.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282ea3 // tbx z3.b, z21.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272ea2 // tbx z2.b, z21.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262ea1 // tbx z1.b, z21.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252ea0 // tbx z0.b, z21.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ "addvl x21, x21, #-6\n"
+ ".inst 0x052b2ec9 // tbx z9.b, z22.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2ec4 // tbx z4.b, z22.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282ec3 // tbx z3.b, z22.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272ec2 // tbx z2.b, z22.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262ec1 // tbx z1.b, z22.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252ec0 // tbx z0.b, z22.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ "cmp x21, XZR\n"
+ ".inst 0x052b2ee9 // tbx z9.b, z23.b, z11.b\n"
+ ".inst 0x052a2ee4 // tbx z4.b, z23.b, z10.b\n"
+ ".inst 0x05282ee3 // tbx z3.b, z23.b, z8.b\n"
+ "st1b { z9.b }, p5, [x22]\n"
+ ".inst 0x05272ee2 // tbx z2.b, z23.b, z7.b\n"
+ ".inst 0x05262ee1 // tbx z1.b, z23.b, z6.b\n"
+ "st1b { z4.b }, p4, [x22, #1, MUL VL]\n"
+ ".inst 0x05252ee0 // tbx z0.b, z23.b, z5.b\n"
+ "st1b { z3.b }, p3, [x22, #2, MUL VL]\n"
+ "addvl x23, x23, #6\n"
+ "st1b { z2.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z1.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z0.b }, p0, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "bgt 6b\n"
+ "b 17f\n"
+ "8:" // 512 bits
+ "mov z12.b, #0x40\n"
+ "mov x21, %x[string_length]\n"
+ "ptrue p5.b\n"
+ "ptrue p4.b\n"
+ "ptrue p3.b\n"
+ "ptrue p2.b\n"
+ "ptrue p1.b\n"
+ "ptrue p0.b\n"
+ "9:" // 4 rounds: width loop
+ "addvl x20, x21, #-6\n"
+ "cmp x20, XZR\n"
+ "bge 10f\n"
+ "mov x20, #0x0\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p5.b, XZR, x21\n"
+ "whilelt p4.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p3.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p2.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p1.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p0.b, x20, x21\n"
+ "10:" // 4 rounds: predicate OK
+ "ld1b { z11.b }, p5/Z, [x23]\n"
+ "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n"
+ "tbl z9.b, { z16.b }, z11.b\n"
+ "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n"
+ "sub z11.b, z11.b, z12.b\n"
+ "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n"
+ "tbl z4.b, { z16.b }, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ "tbl z3.b, { z16.b }, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ "tbl z2.b, { z16.b }, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ "tbl z1.b, { z16.b }, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ "tbl z0.b, { z16.b }, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ "addvl x21, x21, #-6\n"
+ ".inst 0x052b2e49 // tbx z9.b, z18.b, z11.b\n"
+ "sub z11.b, z11.b, z12.b\n"
+ ".inst 0x052a2e44 // tbx z4.b, z18.b, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ ".inst 0x05282e43 // tbx z3.b, z18.b, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ ".inst 0x05272e42 // tbx z2.b, z18.b, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ ".inst 0x05262e41 // tbx z1.b, z18.b, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ ".inst 0x05252e40 // tbx z0.b, z18.b, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ "cmp x21, XZR\n"
+ ".inst 0x052b2e69 // tbx z9.b, z19.b, z11.b\n"
+ ".inst 0x052a2e64 // tbx z4.b, z19.b, z10.b\n"
+ ".inst 0x05282e63 // tbx z3.b, z19.b, z8.b\n"
+ "st1b { z9.b }, p5, [x22]\n"
+ ".inst 0x05272e62 // tbx z2.b, z19.b, z7.b\n"
+ ".inst 0x05262e61 // tbx z1.b, z19.b, z6.b\n"
+ "st1b { z4.b }, p4, [x22, #1, MUL VL]\n"
+ ".inst 0x05252e60 // tbx z0.b, z19.b, z5.b\n"
+ "st1b { z3.b }, p3, [x22, #2, MUL VL]\n"
+ "addvl x23, x23, #6\n"
+ "st1b { z2.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z1.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z0.b }, p0, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "bgt 9b\n"
+ "b 17f\n"
+ "11:" // 1024 bits
+ "mov z12.b, #0x80\n"
+ "mov x21, %x[string_length]\n"
+ "ptrue p5.b\n"
+ "ptrue p4.b\n"
+ "ptrue p3.b\n"
+ "ptrue p2.b\n"
+ "ptrue p1.b\n"
+ "ptrue p0.b\n"
+ "12:" // 2 rounds: width loop
+ "addvl x20, x21, #-6\n"
+ "cmp x20, XZR\n"
+ "bge 13f\n"
+ "mov x20, #0x0\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p5.b, XZR, x21\n"
+ "whilelt p4.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p3.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p2.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p1.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p0.b, x20, x21\n"
+ "13:" // 2 rounds: predicate OK
+ "ld1b { z11.b }, p5/Z, [x23]\n"
+ "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n"
+ "addvl x21, x21, #-6\n"
+ "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n"
+ "tbl z9.b, { z16.b }, z11.b\n"
+ "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n"
+ "sub z11.b, z11.b, z12.b\n"
+ "tbl z4.b, { z16.b }, z10.b\n"
+ "sub z10.b, z10.b, z12.b\n"
+ "tbl z3.b, { z16.b }, z8.b\n"
+ "sub z8.b, z8.b, z12.b\n"
+ "tbl z2.b, { z16.b }, z7.b\n"
+ "sub z7.b, z7.b, z12.b\n"
+ "tbl z1.b, { z16.b }, z6.b\n"
+ "sub z6.b, z6.b, z12.b\n"
+ "tbl z0.b, { z16.b }, z5.b\n"
+ "sub z5.b, z5.b, z12.b\n"
+ "cmp x21, XZR\n"
+ ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n"
+ ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n"
+ ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n"
+ "st1b { z9.b }, p5, [x22]\n"
+ ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n"
+ ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n"
+ "st1b { z4.b }, p4, [x22, #1, MUL VL]\n"
+ ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n"
+ "st1b { z3.b }, p3, [x22, #2, MUL VL]\n"
+ "addvl x23, x23, #6\n"
+ "st1b { z2.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z1.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z0.b }, p0, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "bgt 12b\n"
+ "b 17f\n"
+ "14:" // 2048 bits
+ "mov x21, %x[string_length]\n"
+ "ptrue p5.b\n"
+ "ptrue p4.b\n"
+ "ptrue p3.b\n"
+ "ptrue p2.b\n"
+ "ptrue p1.b\n"
+ "ptrue p0.b\n"
+ "15:" // 1 rounds: width loop
+ "addvl x20, x21, #-6\n"
+ "cmp x20, XZR\n"
+ "bge 16f\n"
+ "mov x20, #0x0\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p5.b, XZR, x21\n"
+ "whilelt p4.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p3.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p2.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p1.b, x20, x21\n"
+ "addvl x20, x20, #1\n"
+ "whilelt p0.b, x20, x21\n"
+ "16:" // 1 rounds: predicate OK
+ "addvl x21, x21, #-6\n"
+ "ld1b { z11.b }, p5/Z, [x23]\n"
+ "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n"
+ "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n"
+ "cmp x21, XZR\n"
+ "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n"
+ "tbl z9.b, { z16.b }, z11.b\n"
+ "tbl z4.b, { z16.b }, z10.b\n"
+ "tbl z3.b, { z16.b }, z8.b\n"
+ "st1b { z9.b }, p5, [x22]\n"
+ "tbl z2.b, { z16.b }, z7.b\n"
+ "tbl z1.b, { z16.b }, z6.b\n"
+ "st1b { z4.b }, p4, [x22, #1, MUL VL]\n"
+ "tbl z0.b, { z16.b }, z5.b\n"
+ "st1b { z3.b }, p3, [x22, #2, MUL VL]\n"
+ "addvl x23, x23, #6\n"
+ "st1b { z2.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z1.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z0.b }, p0, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "bgt 15b\n"
+ "17:" // SVE body done
+ "add x24, x24, #0x1\n"
+ "cmp x24, %x[num_strings]\n"
+ "bne 2b\n"
+ : [table] "+&r"(table)
+ : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1",
+ "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21",
+ "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __aarch64__
diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h
new file mode 100644
index 0000000000..9acfe97728
--- /dev/null
+++ b/src/cpu/kernels/lut/list.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_LUT_LIST_H
+#define ACL_SRC_CPU_KERNELS_LUT_LIST_H
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
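+// Table-lookup kernels: every element of the input is replaced by table[element]. The u8 variants
+// index a 256-entry byte table across a batch of strings; the u16 variants index a 65536-entry
+// table over a flat buffer.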
+#ifdef __aarch64__
+#define DECLARE_LUT_U8_KERNEL(func_name) \
+ void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \
+ uint8_t *const *output)
+
+DECLARE_LUT_U8_KERNEL(lut_u8_neon);
+DECLARE_LUT_U8_KERNEL(lut_u8_sve2);
+
+#undef DECLARE_LUT_U8_KERNEL
+
+#define DECLARE_LUT_U16_KERNEL(func_name) \
+ void func_name(const uint16_t *table, size_t num_strings, size_t string_length, const uint16_t *input, \
+ uint16_t *output)
+
+DECLARE_LUT_U16_KERNEL(lut_u16_neon);
+DECLARE_LUT_U16_KERNEL(lut_u16_sve);
+
+#undef DECLARE_LUT_U16_KERNEL
+
+#endif // __aarch64__
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_LUT_LIST_H
diff --git a/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp b/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..e81ff92311
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
+{
+ return max_unpooling<float16_t>(input, indices, output, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp b/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..ba0d7851a9
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
+{
+ return max_unpooling<float>(input, indices, output, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h
new file mode 100644
index 0000000000..73a5b86a2f
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+namespace arm_compute
+{
+namespace cpu
+{
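+// Max-unpooling: scatters each input element into the output buffer at the linear offset stored in
+// the matching element of the indices tensor (the argmax recorded by the preceding max-pooling),
+// offset by the batch stride.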
+template <typename T>
+void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
+{
+ Iterator input_itr(input, window);
+ Iterator indices_itr(indices, window);
+ auto out_ptr = reinterpret_cast<T *>(output->buffer());
+ const int out_stride_w = static_cast<int>(output->info()->strides_in_bytes()[3]);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr());
+ auto vinput = reinterpret_cast<T *>(input_itr.ptr());
+ out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
+ },
+ input_itr, indices_itr);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp b/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..53e601bba6
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu8_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
+{
+    return max_unpooling<uint8_t>(input, indices, output, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..a3c346fba7
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qs8_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
+{
+    return max_unpooling<int8_t>(input, indices, output, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/maxunpool/list.h b/src/cpu/kernels/maxunpool/list.h
new file mode 100644
index 0000000000..2c4fe940d9
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/list.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H
+#define SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_MAXUNPOOL_KERNEL(func_name) \
+ void func_name(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
+DECLARE_MAXUNPOOL_KERNEL(neon_fp32_maxunpooling);
+DECLARE_MAXUNPOOL_KERNEL(neon_fp16_maxunpooling);
+DECLARE_MAXUNPOOL_KERNEL(neon_qs8_maxunpooling);
+DECLARE_MAXUNPOOL_KERNEL(neon_qu8_maxunpooling);
+#undef DECLARE_MAXUNPOOL_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..344b9df0c8
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
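+// FP16 specialization: the sum of squares is accumulated in f32 lanes to limit precision loss
+// before the mean and variance are derived.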
+template <>
+void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+    // Collapse the X dimension on the execution window; elements along X are processed manually below
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Iterator input_itr(input, win);
+ Iterator output_itr(output, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
+
+ float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f));
+ float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8_t data = vld1q_f16(in_ptr + x);
+ sum_vec = vaddq_f16(sum_vec, data);
+ float32x4_t dl = vcvt_f32_f16(vget_low_f16(data));
+ float32x4_t dh = vcvt_f32_f16(vget_high_f16(data));
+ sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
+ sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
+ }
+
+ float32x4_t sum_carry_res =
+ vpaddq_f32(vcvt_f32_f16(vget_high_f16(sum_vec)), vcvt_f32_f16(vget_low_f16(sum_vec)));
+ float sum = vaddvq_f32(sum_carry_res);
+ float sum_sq = vaddvq_f32(sum_sq_vec);
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float fdata = static_cast<float>(*(in_ptr + x));
+ sum += fdata;
+ sum_sq += fdata * fdata;
+ }
+
+ float16_t mean = static_cast<float16_t>(sum / input->info()->dimension(0));
+ float var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+ float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon));
+
+ float16x8_t mean_vec = vdupq_n_f16(mean);
+ float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv);
+
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8_t data = vld1q_f16(in_ptr + x);
+ float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec);
+ // Store results
+ vst1q_f16(out_ptr + x, res);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+ }
+ },
+ input_itr, output_itr);
+}
+
+void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+ return mean_stddev_normalization<float16_t, 8>(input, output, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..4bff26b036
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+ return mean_stddev_normalization<float, 4>(input, output, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
new file mode 100644
index 0000000000..11f6294a35
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
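+// Generic mean/stddev normalization: each row is reduced to a sum and a sum of squares with vector
+// accumulators, the mean and variance are derived from them, and the row is rewritten as
+// (x - mean) * 1 / sqrt(var + epsilon).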
+template <typename ScalarType, int size>
+void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+ using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
+
+    // Collapse the X dimension on the execution window; elements along X are processed manually below
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = size;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Iterator input_itr(input, win);
+ Iterator output_itr(output, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
+
+ auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+ auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ sum_vec = wrapper::vadd(sum_vec, data);
+ sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
+ }
+
+ auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
+ auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
+ for (int i = 0; i < size / 4; ++i)
+ {
+ sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
+ sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
+ }
+
+ auto sum = wrapper::vgetlane(sum_carry_res, 0);
+ auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ ScalarType data = *(in_ptr + x);
+ sum += data;
+ sum_sq += data * data;
+ }
+
+ ScalarType mean = sum / input->info()->dimension(0);
+ ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+ ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
+
+ auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
+ // Store results
+ wrapper::vstore(out_ptr + x, res);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+ }
+ },
+ input_itr, output_itr);
+}
+template void mean_stddev_normalization<float, 4>(ITensor *input, ITensor *output, float epsilon, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h
new file mode 100644
index 0000000000..6466506f06
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H
+#define SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType, int size>
+void mean_stddev_normalization(ITensor *_input, ITensor *_output, float _epsilon, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..32654df5dc
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+namespace
+{
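+// Helpers: clamp a float vector to the quantized output range and narrow four f32 blocks back down
+// to a single u8 vector (f32 -> u16 -> u8).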
+inline float32x4_t clamp_v4f32(float32x4_t block, float32x4_t quant_min_vec, float32x4_t quant_max_vec)
+{
+ return vminq_f32(vmaxq_f32(block, quant_min_vec), quant_max_vec);
+}
+inline uint16x8_t fuse_words_f32(float32x4_t fb1, float32x4_t fb2)
+{
+ return vcombine_u16(vmovn_u32(vcvtq_u32_f32(fb1)), vmovn_u32(vcvtq_u32_f32(fb2)));
+}
+inline uint8x16_t fuse_shorts_u16(uint16x8_t sb1, uint16x8_t sb2)
+{
+ return vcombine_u8(vmovn_u16(sb1), vmovn_u16(sb2));
+}
+} // namespace
+
+namespace arm_compute
+{
+namespace cpu
+{
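+// Quantized mean/stddev normalization: statistics are accumulated on the raw uint8 values, and the
+// normalization plus output requantization are folded into one per-row scale and offset.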
+void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 16;
+ const int window_start_x = static_cast<int>(window.x().start());
+ const int window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo qi_out = output->info()->quantization_info().uniform();
+ const float output_scale = qi_out.scale;
+ const int output_offset = qi_out.offset;
+
+ Iterator input_itr(input, win);
+ Iterator output_itr(output, win);
+
+ const float output_inv_scale = 1.0f / output_scale;
+ const float32x4_t quant_max_vec = vdupq_n_f32(255.0f);
+ const float32x4_t quant_min_vec = vdupq_n_f32(0.0f);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
+
+ uint32x4_t sum_vec = vdupq_n_u32(0);
+ uint32x4_t sum_sq_vec = vdupq_n_u32(0);
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t data = vld1q_u8(in_ptr + x);
+ sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
+ const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data));
+ const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
+ sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
+ }
+
+#ifdef __aarch64__
+ sum_vec = vpaddq_u32(sum_vec, sum_vec);
+ sum_vec = vpaddq_u32(sum_vec, sum_vec);
+ uint32_t sum = vgetq_lane_u32(sum_vec, 0);
+ sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+ sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+ uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
+#elif __arm__ // #ifdef __aarch64__
+ uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) +
+ vgetq_lane_u32(sum_vec, 3);
+
+ uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) +
+ vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3);
+#endif // #ifdef __aarch64__
+ for (; x < window_end_x; ++x)
+ {
+ auto data = static_cast<uint32_t>(*(in_ptr + x));
+ sum += data;
+ sum_sq += (data * data);
+ }
+
+ const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
+ const float var =
+ (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
+ const float stdev_inv = 1.0f / sqrtf(var + epsilon);
+ const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale);
+ const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t data = vld1q_u8(in_ptr + x);
+ float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
+ float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
+ float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
+ float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
+ db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
+ vst1q_u8(out_ptr + x, out);
+ }
+
+ for (; x < window_end_x; ++x)
+ {
+ auto data = static_cast<float32_t>(*(in_ptr + x));
+ const uint8_t res =
+ data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
+ *(out_ptr + x) = res;
+ }
+ },
+ input_itr, output_itr);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/meanstddevnorm/list.h b/src/cpu/kernels/meanstddevnorm/list.h
new file mode 100644
index 0000000000..6277d65884
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/list.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H
+#define SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_MEANSTDDEVNORM_KERNEL(func_name) \
+ void func_name(ITensor *input, ITensor *output, float epsilon, const Window &window)
+
+DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp32_meanstddevnorm);
+DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp16_meanstddevnorm);
+DECLARE_MEANSTDDEVNORM_KERNEL(neon_qasymm8_meanstddevnorm);
+
+#undef DECLARE_MEANSTDDEVNORM_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H
diff --git a/src/cpu/kernels/mul/generic/neon/fp16.cpp b/src/cpu/kernels/mul/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..920f298527
--- /dev/null
+++ b/src/cpu/kernels/mul/generic/neon/fp16.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/CpuTypes.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
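+// Element-wise product of two FP16 tensors scaled by a runtime factor; a dedicated path handles the
+// case where one operand is broadcast along the X dimension.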
+void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ constexpr int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator dst(out, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
+ const float16x8x2_t broadcast_value_vec = {{
+ vdupq_n_f16(broadcast_value),
+ vdupq_n_f16(broadcast_value),
+ }};
+ const auto scale_vec = vdupq_n_f16(scale);
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float16x8x2_t non_broadcast_v = {{
+ vld1q_f16(non_broadcast_input_ptr + x),
+ vld1q_f16(non_broadcast_input_ptr + x + 8),
+ }};
+ const float16x8x2_t result = {{
+ vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
+ }};
+ vst1q_f16(output_ptr + x, result.val[0]);
+ vst1q_f16(output_ptr + x + 8, result.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
+ }
+ else
+ {
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float16x8x2_t ta1 = {{
+ vld1q_f16(input1_ptr + x),
+ vld1q_f16(input1_ptr + x + 8),
+ }};
+ const float16x8x2_t ta2 = {{
+ vld1q_f16(input2_ptr + x),
+ vld1q_f16(input2_ptr + x + 8),
+ }};
+ const float16x8_t scale_vec = vdupq_n_f16(scale);
+ const float16x8x2_t result = {{
+ vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
+ }};
+ vst1q_f16(output_ptr + x, result.val[0]);
+ vst1q_f16(output_ptr + x + 8, result.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto ta1 = *(input1_ptr + x);
+ const auto ta2 = *(input2_ptr + x);
+ *(output_ptr + x) = ta1 * ta2 * scale;
+ }
+ },
+ input1, input2, dst);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/mul/generic/neon/fp32.cpp b/src/cpu/kernels/mul/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..3001eb5110
--- /dev/null
+++ b/src/cpu/kernels/mul/generic/neon/fp32.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/CpuTypes.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
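+// Element-wise product of two FP32 tensors scaled by a runtime factor, using the generic NEON
+// wrapper types; a dedicated path handles broadcasting of one operand along the X dimension.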
+void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(float);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
+
+ using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator dst(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+ const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto ta1 = wrapper::vloadq(input1_ptr + x);
+ const auto ta2 = wrapper::vloadq(input2_ptr + x);
+ const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
+ const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto ta1 = *(input1_ptr + x);
+ const auto ta2 = *(input2_ptr + x);
+ *(output_ptr + x) = ta1 * ta2 * scale;
+ }
+ },
+ input1, input2, dst);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/mul/generic/neon/list.h b/src/cpu/kernels/mul/generic/neon/list.h
new file mode 100644
index 0000000000..710cb68b72
--- /dev/null
+++ b/src/cpu/kernels/mul/generic/neon/list.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_MUL_KERNEL(func_name) \
+ void func_name(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+
+DECLARE_MUL_KERNEL(mul_F32_F32_F32);
+DECLARE_MUL_KERNEL(mul_F16_F16_F16);
+#undef DECLARE_MUL_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H
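Editor's note: the DECLARE_MUL_KERNEL macro above only factors out the shared signature. For reference, DECLARE_MUL_KERNEL(mul_F32_F32_F32); expands to the plain prototype:

    void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale);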
diff --git a/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..f85fe7a31a
--- /dev/null
+++ b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/norm_layer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+void neon_normalize_float16_8_0_2D(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float16_t, 8, 0, true>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float16_8_0(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float16_t, 8, 0, false>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float16_8_1_2D(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float16_t, 8, 1, true>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float16_8_1(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float16_t, 8, 1, false>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float16_8_2(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float16_t, 8, 2, false>(window, in, in_squared, out, ninfo);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..0b64f46956
--- /dev/null
+++ b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/norm_layer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_normalize_float32_4_0_2D(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float, 4, 0, true>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float32_4_0(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float, 4, 0, false>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float32_4_1_2D(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float, 4, 1, true>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float32_4_1(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float, 4, 1, false>(window, in, in_squared, out, ninfo);
+}
+
+void neon_normalize_float32_4_2(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ arm_compute::normalize_float<float, 4, 2, false>(window, in, in_squared, out, ninfo);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/norm_layer/generic/neon/impl.h b/src/cpu/kernels/norm_layer/generic/neon/impl.h
new file mode 100644
index 0000000000..6103165679
--- /dev/null
+++ b/src/cpu/kernels/norm_layer/generic/neon/impl.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/NormalizationHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+/** Function to perform normalization over the dimension selected by the
+ * template parameter dim. The last template parameter specifies whether the
+ * normalization is 1D or 2D.
+ *
+ * @note Only supported normalizations are:
+ * - 1D over X or Z
+ * - 2D over X and Y
+ *
+ * @param[in] window Region on which to execute the kernel.
+ * @param[in] in Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] in_squared Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+ * Data type and layout supported: same as @p in.
+ * @param[out] out Destination tensor. Output has the same number of dimensions as the input. Data type and layout supported: same as @p in.
+ * @param[in] ninfo Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
+void normalize_float(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = S;
+
+ Iterator input(in, win);
+ Iterator input_squared(in_squared, win);
+ Iterator output(out, win);
+
+ const int dim_y = in->info()->data_layout() == DataLayout::NCHW ? 1 : 2;
+ const int radius = ninfo.norm_size() / 2;
+ const int input_squared_stride_x = in_squared->info()->strides_in_bytes()[0];
+ const int input_squared_stride_slice = in_squared->info()->strides_in_bytes()[dim];
+ const int input_squared_stride_row = in_squared->info()->strides_in_bytes()[dim_y];
+
+ const int max_right = in->info()->dimension(dim) - 1;
+ const int max_bottom = in->info()->dimension(dim_y) - 1;
+
+ const auto coeff_vec = wrapper::vdup_n(static_cast<T>(ninfo.scale_coeff()), ExactTagType{});
+ const auto beta_vec = wrapper::vdup_n(static_cast<T>(ninfo.beta()), ExactTagType{});
+ const auto kappa_vec = wrapper::vdup_n(static_cast<T>(ninfo.kappa()), ExactTagType{});
+
+ auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row,
+ const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr,
+ T *output_ptr)
+ {
+ const int current_slice = dim == 0 ? x : id[dim];
+ const int first_slice = std::max(current_slice - radius, 0);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
+ // Accumulate 2D In-Map values
+ auto accu = static_cast<T>(0.f);
+ for (int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu +=
+ *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
+ }
+ }
+
+ // Normalize
+ const auto normalized =
+ std::pow(accu * static_cast<T>(ninfo.scale_coeff()) + static_cast<T>(ninfo.kappa()), ninfo.beta());
+ const auto normalized_pixel = (*(input_ptr + x)) / normalized;
+ *(output_ptr + x) = normalized_pixel;
+ };
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+
+ int x = window_start_x;
+            // Process the first elements sequentially when the normalized dimension is X (width)
+ for (; x < radius && x < window_end_x && dim == 0; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
+
+ // Compute vectorized
+ for (; x <= window_end_x - window_step_x - radius; x += window_step_x)
+ {
+ const int current_slice = dim == 0 ? x : id[dim];
+ const int first_slice = std::max(current_slice - radius, 0);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
+ // Accumulate 2D In-Map values
+ auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ for (int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const uint8_t *const input_squared_ptr =
+ input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = wrapper::vadd(
+ accu, wrapper::vloadq(reinterpret_cast<const T *>(
+ input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+ }
+ }
+
+ // Normalize
+ const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+ const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
+ wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
+ },
+ input, input_squared, output);
+}
+
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H
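Editor's note: the per-element arithmetic carried out by normalize_float, both in sequential_normalization and in the vectorized loop, reduces to dividing the input by a power of the accumulated squared neighbourhood. A scalar sketch follows, where accu is the sum of in_squared over the normalization window and scale_coeff, kappa and beta come from NormalizationLayerInfo; the helper name is hypothetical.

    #include <cmath>

    // out = in / (scale_coeff * accu + kappa) ^ beta
    float lrn_element_reference(float in_value, float accu, float scale_coeff, float kappa, float beta)
    {
        const float denom = std::pow(scale_coeff * accu + kappa, beta);
        return in_value / denom;
    }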
diff --git a/src/cpu/kernels/norm_layer/generic/neon/list.h b/src/cpu/kernels/norm_layer/generic/neon/list.h
new file mode 100644
index 0000000000..f2e83d7af1
--- /dev/null
+++ b/src/cpu/kernels/norm_layer/generic/neon/list.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+
+#define DECLARE_NORMALIZATION_KERNEL(func_name) \
+ void func_name(const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, \
+ NormalizationLayerInfo ninfo)
+
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0_2D);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1_2D);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_2);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0_2D);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1_2D);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1);
+DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_2);
+
+#undef DECLARE_NORMALIZATION_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H
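Editor's note: the suffixes in the declarations above encode the normalize_float template arguments instantiated in fp32.cpp and fp16.cpp: element type, vector lanes (4 for F32, 8 for F16), the tensor dimension the window slides along, and whether the in-map 2D variant is instantiated. For example:

    // neon_normalize_float32_4_1_2D -> normalize_float<float,     4 /*lanes*/, 1 /*dim*/, true  /*2D*/>
    // neon_normalize_float16_8_2    -> normalize_float<float16_t, 8 /*lanes*/, 2 /*dim*/, false /*2D*/>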
diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp
new file mode 100644
index 0000000000..9d24d79afb
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/fp16.cpp
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/impl.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifdef ENABLE_NCHW_KERNELS
+
+namespace
+{
+float16x4_t
+read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
+{
+ float16_t vec[4];
+ const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
+ for (int i = 0; i < 4; i++)
+ {
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ {
+ vec[i] = *(ptr + i);
+ }
+ else
+ {
+ vec[i] = fval;
+ }
+ }
+ return wrapper::vload(vec);
+}
+} // namespace
+
+void pooling3_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ constexpr const int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
+ const unsigned char *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_middle_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const unsigned char *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+ float16x4_t top_data =
+ read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
+ float16x4_t middle_data = read_4_boundary_aware_fp16(
+ src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
+ float16x4_t bottom_data = read_4_boundary_aware_fp16(
+ src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2,
+ reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
+ float16x4_t res = {};
+
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ middle_data = vmul_f16(middle_data, middle_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
+ // Perform pooling
+ const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
+ res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+ res = vmul_f16(vpadd_f16(res, res), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+ res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
+ res = vpmax_f16(res, res);
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = vsqrt_f16(res);
+ }
+
+ *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+ },
+ in, out);
+}
+#endif // ENABLE_NCHW_KERNELS
+
+void pooling2_f16_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 8;
+
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, window_src);
+ Iterator out(dst0, window_out);
+ Iterator indices(dst1, window_out);
+
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+
+ const int pad_right = src->info()->padding().right;
+ const int pad_left = src->info()->padding().left;
+ const int pad_horizontal = pad_right + pad_left;
+ const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int in_x0_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x1_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x2_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x3_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
+ const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
+ const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
+ const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
+ const auto v_x0 = vld1q_f16(in_x0_ptr);
+ const auto v_x1 = vld1q_f16(in_x1_ptr);
+ const auto v_x2 = vld1q_f16(in_x2_ptr);
+ const auto v_x3 = vld1q_f16(in_x3_ptr);
+ float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
+
+ const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+ const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+ pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+ const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7};
+ const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
+ const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+ const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7};
+ const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
+ const uint32x4_t voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+ const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7};
+ const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
+ const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
+ const uint32x4_t voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7};
+ const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
+ const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
+ const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
+ const uint16x8_t tmp_indices2 =
+ vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+                const uint32x4_t tmp_indices3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
+                const uint32x4_t tmp_indices3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
+                // Store indices
+                vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices3_0);
+                vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indices3_1);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
+ float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
+
+ const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+ const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+ pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
+ const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
+ const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+ // Store indices
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+ }
+ },
+ in, out, indices);
+}
+#ifdef ENABLE_NCHW_KERNELS
+
+void pooling2_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ if (pool_info.pool_type == PoolingType::MAX && dst1)
+ {
+ pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
+ }
+ else
+ {
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+ constexpr int pool_size = 2;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+        int pool_stride_x = 0;
+        int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+
+ const unsigned char *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
+
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+ y_val_0, in_top_ptr, fill_value);
+ float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+ y_val_1, in_bottom_ptr, fill_value);
+ float16x4_t res = {};
+
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
+
+ const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
+ res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(top_data, bottom_data);
+ res = vpmax_f16(max_data, max_data);
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = vsqrt_f16(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+ },
+ in, out);
+ }
+}
+
+void poolingMxN_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ float16_t res = 0.0f;
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float16_t scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float16_t *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data *= data;
+ }
+
+ res += data;
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else // if max pooling
+ {
+ res = fp16_min;
+
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float16_t *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ res = std::max(res, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr())) = res;
+ },
+ in, out);
+}
+#endif // ENABLE_NCHW_KERNELS
+
+void poolingMxN_fp16_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
+ {
+ pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
+ }
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 8;
+
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, window_src);
+ Iterator out(dst0, window_out);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t min_value = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ float16x8_t vres;
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x8_t scale_v = vdupq_n_f16(scale);
+
+ // Perform pooling
+ vres = vdupq_n_f16(0.0f);
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float16x8_t data = vld1q_f16(
+ reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ vres = vaddq_f16(vres, vmulq_f16(data, data));
+ }
+ else
+ {
+ vres = vaddq_f16(vres, data);
+ }
+ }
+ }
+ // Divide by scale
+ vres = vmulq_f16(vres, scale_v);
+ }
+ else
+ {
+ vres = vdupq_n_f16(min_value);
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float16x8_t data = vld1q_f16(
+ reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = vmaxq_f16(vres, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
+ vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal));
+ }
+
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ float16_t res = 0.0f;
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float16_t scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float data =
+ *(reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res += data * data;
+ }
+ else
+ {
+ res += data;
+ }
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else
+ {
+ res = min_value;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float16_t data =
+ *(reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
+ }
+ },
+ in, out);
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
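Editor's note: in the fp16 NHWC L2-pooling path above, the square root is computed as vres * rsqrt(vres), with vrsqrteq_f16 providing a rough reciprocal-square-root estimate and a single vrsqrtsq_f16 Newton-Raphson step refining it. A scalar model of that refinement (a sketch only, written in single precision for clarity):

    #include <cmath>

    float sqrt_via_rsqrt_reference(float x)
    {
        float r = 1.0f / std::sqrt(x);     // stands in for the vrsqrteq_f16 estimate
        r *= (3.0f - x * r * r) * 0.5f;    // one Newton-Raphson step: vrsqrtsq_f16 computes (3 - a*b) / 2
        return x * r;                      // sqrt(x) = x * rsqrt(x)
    }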
diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp
new file mode 100644
index 0000000000..aaa37863cb
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/fp32.cpp
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+void pooling2_f32_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 4;
+
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, window_src);
+ Iterator out(dst0, window_out);
+ Iterator indices(dst1, window_out);
+
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+
+ float32x4_t vres;
+ float res;
+
+ const int pad_right = src->info()->padding().right;
+ const int pad_left = src->info()->padding().left;
+ const int pad_horizontal = pad_right + pad_left;
+ const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+
+ const int in_x0_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x1_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x2_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x3_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
+ const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
+ const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
+ const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
+ const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
+ const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
+ const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
+ const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
+ vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+
+ const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
+ const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t offset_x2 =
+ offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32x4_t voffset_x0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+ const uint32x4_t voffset_x1 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+ const uint32x4_t voffset_x2 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+ const uint32x4_t voffset_x3 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
+ const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
+ const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
+ const uint32x4_t tmp_indices2 =
+ vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+
+ // Store indices
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
+ res = std::max(std::max(x2, x3), std::max(x0, x1));
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+
+ const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
+ const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t offset_x2 =
+ offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
+ const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
+ const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+ // Store indices
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+ }
+ },
+ in, out, indices);
+}
+} // namespace
+
+void poolingMxN_fp32_neon_nhwc_kernel_indices(
+ const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window)
+{
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ constexpr int window_step_x = 4;
+
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator out(dst0, window_out);
+ Iterator indices(dst1, window_out);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+
+ float32x4_t vres;
+ uint32x4_t vidx;
+
+ constexpr int idx_width = 1;
+ constexpr int idx_height = 2;
+ constexpr int idx_batch = 3;
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[idx_batch]);
+
+ const int input_dim_w = src->info()->dimension(idx_width);
+ const int input_dim_h = src->info()->dimension(idx_height);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+
+ const int pool_start_x = std::max(0, -idx_width);
+ const int pool_start_y = std::max(0, -idx_height);
+
+ const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width);
+ const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride;
+
+ const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride);
+ const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride);
+
+ int x_off = window_start_x;
+
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ vres = vdupq_n_f32(min_value);
+ vidx = vdupq_n_u32(0U);
+ const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+ uint32_t curr_kernel_index = pool_size_x * pool_start_y;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+ curr_kernel_index += pool_start_x;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x));
+ const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index);
+ const uint32x4_t idxMask = vcgtq_f32(data, vres);
+ vidx = vbslq_u32(idxMask, vidx_curr, vidx);
+ vres = vmaxq_f32(vres, data);
+ in_ptr_x += y_stride;
+ curr_kernel_index++;
+ }
+ curr_kernel_index += (pool_size_x - pool_end_x);
+ in_ptr_y += z_stride;
+ }
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ float res = min_value;
+ uint32_t idx = 0U;
+ const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float data = *(reinterpret_cast<const float *>(in_ptr_x));
+ if (data > res)
+ {
+ idx = pool_size_x * y + x;
+ res = data;
+ }
+ in_ptr_x += y_stride;
+ }
+ in_ptr_y += z_stride;
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx;
+ }
+ },
+ out, indices);
+}
+
+void poolingMxN_fp32_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ if ((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr))
+ {
+ poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window);
+ }
+ else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX &&
+ !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr))
+ {
+ pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
+ }
+ else
+ {
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 4;
+
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, window_src);
+ Iterator out(dst0, window_out);
+
+ const int pool_size_x =
+ pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y =
+ pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ float32x4_t vres;
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+
+ // Perform pooling
+ vres = vdupq_n_f32(0.0f);
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float32x4_t data = vld1q_f32(
+ reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+                            // Square the input in case of L2 pooling, then accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ vres = vmlaq_f32(vres, data, data);
+ }
+ else
+ {
+ vres = vaddq_f32(vres, data);
+ }
+ }
+ }
+ // Divide by scale
+ vres = vmulq_f32(vres, scale_v);
+ }
+ else
+ {
+ vres = vdupq_n_f32(min_value);
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float32x4_t data = vld1q_f32(
+ reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = vmaxq_f32(vres, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ float32x4_t l2_res = {static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))};
+ vres = l2_res;
+ }
+
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ float res = 0.0f;
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float data =
+ *(reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+                            // Square the input in case of L2 pooling, then accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res += data * data;
+ }
+ else
+ {
+ res += data;
+ }
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else
+ {
+ res = min_value;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float data =
+ *(reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+ }
+ },
+ in, out);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/pool2d/neon/impl.h b/src/cpu/kernels/pool2d/neon/impl.h
new file mode 100644
index 0000000000..008cf651e1
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/impl.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+#include <limits>
+
+#ifdef ENABLE_NCHW_KERNELS
+namespace arm_compute
+{
+namespace cpu
+{
+
+namespace
+{
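+// Reads two consecutive values as f32, substituting fval for any lane that falls in the padded
+// border. E.g. (assuming the row is in bounds, pad_l = 1 and x = 0) lane 0 lies in the left
+// padding and becomes fval, while lane 1 is read from ptr + 1.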
+template <typename T>
+auto read_2_boundary_aware_as_f32(int srcw, int srch, int pad_l, int pad_t, int x, int y, const T *ptr, T fval)
+{
+ T vec[2];
+ const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
+ for (int i = 0; i < 2; i++)
+ {
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ {
+ vec[i] = *(ptr + i);
+ }
+ else
+ {
+ vec[i] = fval;
+ }
+ }
+ float32_t vec_f32[2] = {vec[0], vec[1]};
+ return wrapper::vload(vec_f32);
+}
+} // namespace
+
+template <typename T>
+void pooling2_nchw_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+ Iterator indices(dst1, window);
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const int pad_left = src->info()->padding().left;
+ const int pad_right = src->info()->padding().right;
+ const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+ const T float_min = get_initial_min<T>(pool_info.use_inf_as_limit);
+ const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ auto top_data =
+ read_2_boundary_aware_as_f32(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ read_2_boundary_aware_as_f32(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+            // Calculate the max value, comparing the top row first and then the bottom row so that
+            // the first occurrence of the max is recorded.
+ const float32x2_t max_data_top = vpmax_f32(top_data, top_data);
+ const float32x2_t max_data_bottom = vpmax_f32(bottom_data, bottom_data);
+ const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
+ *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+
+            // Calculate the index of the max value, which will be used by max unpooling.
+ const uint32_t offset_base =
+ offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
+ const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
+ const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
+ const uint32x2_t voffset_top = {offset_top, offset_top + 1u};
+ const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u};
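+            // For each row, vcge/vbsl select the offset of the lane holding the larger value;
+            // since the comparison is >=, ties resolve to the earlier (left/top) element, so the
+            // first occurrence of the maximum is reported.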
+ const uint32x2_t tmp_indices_top =
+ vbsl_u32(vcge_f32(top_data, vrev64_f32(top_data)), voffset_top, vrev64_u32(voffset_top));
+ const uint32x2_t tmp_indices_bottom =
+ vbsl_u32(vcge_f32(bottom_data, vrev64_f32(bottom_data)), voffset_bottom, vrev64_u32(voffset_bottom));
+ *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(
+ vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
+ },
+ in, out, indices);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ENABLE_NCHW_KERNELS
+
+#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h
new file mode 100644
index 0000000000..5db843d56b
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/list.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/pool2d/neon/quantized.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_POOLING_KERNEL(func_name) \
+ void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \
+ const Window &window)
+
+DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc);
+DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc);
+DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nhwc);
+DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc);
+
+#if defined(ENABLE_NCHW_KERNELS)
+
+#if defined(ENABLE_FP16_KERNELS)
+DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw);
+DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw);
+DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw);
+#endif /* defined(ENABLE_FP16_KERNELS) */
+
+DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw);
+DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw);
+DECLARE_POOLING_KERNEL(pooling7_fp32_neon_nchw);
+DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nchw);
+#endif /* defined(ENABLE_NCHW_KERNELS) */
+
+#undef DECLARE_POOLING_KERNEL
+
+template <typename T>
+T get_initial_min(bool use_inf_as_limit)
+{
+ return use_inf_as_limit ? -std::numeric_limits<T>::infinity() : std::numeric_limits<T>::lowest();
+}
+
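+// Converts an offset into the padded source buffer into an element index of the unpadded tensor
+// (as consumed by max unpooling) by subtracting the padding bytes accumulated up to the current
+// row, plane and batch.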
+template <typename T>
+inline uint32_t offset_no_padding(uint32_t padded_offset,
+ const Coordinates &id,
+ const ITensorInfo &info,
+ int pool_stride_x,
+ int pool_stride_y,
+ DataLayout data_layout)
+{
+ const int pad_left = info.padding().left;
+ const int pad_right = info.padding().right;
+ const int pad_top = info.padding().top;
+ const int pad_bottom = info.padding().bottom;
+ const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
+ const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
+ const int pad_horiz = pad_left + pad_right;
+ const int pad_vert = pad_top + pad_bottom;
+
+ if (data_layout == DataLayout::NCHW)
+ {
+ const uint32_t offset_base =
+ padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
+ - pad_top * sizeof(T) /* top padding */
+ - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() -
+ pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
+ - in_stride_w * id[3];
+
+ return offset_base;
+ }
+ else
+ {
+ const uint32_t offset_base = padded_offset -
+ sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
+ - pad_top * sizeof(T) // top padding
+ - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() *
+ pool_stride_y // for each Z plane there are width*pad_right padding elems
+ - in_stride_w * id[3];
+
+ return offset_base;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
new file mode 100644
index 0000000000..0602bea667
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/impl.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+#include <limits>
+
+#ifdef ENABLE_NCHW_KERNELS
+namespace arm_compute
+{
+namespace cpu
+{
+#define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
+ (x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr)
+#define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
+ (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) \
+ : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
+ ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \
+ ? vdup_n_f32(fval) \
+ : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+
+#define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
+ vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \
+ READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval))
+
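+// The READ_*_BOUNDARY_AWARE macros load 2 (or 4) contiguous floats, substituting fval for lanes
+// that fall into the left/right/top/bottom padding region; read_8_boundary_aware composes two
+// 4-wide reads into a full row of 8.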
+float32x4x2_t
+read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
+{
+ float32x4x2_t vec;
+ vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval);
+ vec.val[1] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 4), y, (ptr + 4), fval);
+ return vec;
+}
+
+void poolingMxN_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ float res = 0.0f;
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data *= data;
+ }
+
+ res += data;
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else // if max pooling
+ {
+ res = min_value;
+
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ res = std::max(res, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = res;
+ },
+ in, out);
+}
+
+void pooling2_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ if (pool_info.pool_type == PoolingType::MAX && dst1)
+ {
+ pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
+ }
+ else
+ {
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+ constexpr int pool_size = 2;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
+
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ in_top_ptr, fill_value);
+ auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ in_bottom_ptr, fill_value);
+ float32x2_t res = {};
+ float final_res = 0;
+
+                // Square the input in case of L2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f32(top_data, top_data);
+ bottom_data = vmul_f32(bottom_data, bottom_data);
+ }
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
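+                    // vpadd_f32 folds the two column sums into a single value (replicated in both
+                    // lanes), so one multiply by scale_v yields the 2x2 window average.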
+ res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+ res = vpmax_f32(max_data, max_data);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = final_res;
+ },
+ in, out);
+ }
+}
+
+void pooling3_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ constexpr const int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_middle_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+ const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
+
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+ auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr,
+ fill_value);
+ auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ in_middle_ptr, fill_value);
+ auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2,
+ in_bottom_ptr, fill_value);
+
+ float32x2_t res = {};
+ float final_res = 0;
+
+            // Square the input in case of L2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmulq_f32(top_data, top_data);
+ middle_data = vmulq_f32(middle_data, middle_data);
+ bottom_data = vmulq_f32(bottom_data, bottom_data);
+ }
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
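+                // Lane 3 lies outside the 3-wide window, so it is neutralised (zeroed here, set to
+                // min_value on the MAX path) before the horizontal reduction.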
+ res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data));
+ res = vpmax_f32(res, res);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = final_res;
+ },
+ in, out);
+}
+
+void pooling7_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ constexpr const int pool_size = 7;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ std::array<const uint8_t *, pool_size> src_ptrs{{}};
+ for (int i = 0; i < pool_size; ++i)
+ {
+ src_ptrs[i] =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+ }
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
+
+ auto x_val = id.x() * pool_stride_x;
+ auto y_val = id.y() * pool_stride_y;
+ float32x4x2_t data =
+ read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
+
+ float32x2_t res = {};
+ float final_res = 0.f;
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+                // Square the input in case of L2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
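+                // Only 7 of the 8 loaded lanes belong to the window, so the last lane of the upper
+                // half is cleared before it is added to the running sum.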
+ for (int i = 1; i < pool_size; ++i)
+ {
+ in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+
+ x_val = id.x() * pool_stride_x;
+ y_val = (id.y() * pool_stride_y) + i;
+ data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr,
+ fill_value);
+                    // Square the input in case of L2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ sum_data = vaddq_f32(sum_data, data.val[0]);
+ sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ }
+ res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ for (int i = 1; i < pool_size; ++i)
+ {
+ in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+
+ x_val = id.x() * pool_stride_x;
+ y_val = (id.y() * pool_stride_y) + i;
+ float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val,
+ in_ptr, fill_value);
+ data = vmax2q_f32(data, temp);
+ }
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1]));
+ res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
+ res = vpmax_f32(res, res);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = final_res;
+ },
+ in, out);
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ENABLE_NCHW_KERNELS
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
new file mode 100644
index 0000000000..44675b5394
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void poolingMxN_qasymm8_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..d434323e89
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h
new file mode 100644
index 0000000000..38f1b2f1f9
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/quantized.h
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
+#define SRC_CORE_NEON_KERNELS_QUANTIZED_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/PoolingHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void poolingMxN_q8_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16;
+ const int window_half_step_x = window_step_x / 2;
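+    // 16 quantized channels are processed per vector iteration; the MAX path additionally handles
+    // an 8-lane tail before falling back to the scalar left-overs loop.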
+
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, window_src);
+ Iterator out(dst0, window_out);
+
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+ using q16_t = typename wrapper::traits::promote_t<T>;
+ using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
+ using q32_t = typename wrapper::traits::promote_t<q16_t>;
+ using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+
+ const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
+
+ const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
+    // "new_offset" does not need to account for "half_scale_v" in its computation,
+    // since the requantization is performed in a single step and introduces no extra rounding error.
+ const int32_t new_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
+
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
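+    // quant_rescale/new_offset are used on the averaging path (vrequantize_pooling_with_scale),
+    // where the pooling scale is folded in as well, while requant_qinfo is used on the MAX paths,
+    // which need no pooling scale.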
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x16_t data = wrapper::vloadq(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+ const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+ vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+ vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+ vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+ vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ }
+ }
+
+ if (src_qinfo != dst_qinfo)
+ {
+ const float32x4x4_t vres = {{
+ vcvtq_f32_q32(vres1),
+ vcvtq_f32_q32(vres2),
+ vcvtq_f32_q32(vres3),
+ vcvtq_f32_q32(vres4),
+ }};
+ const auto requantized_dst =
+ vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8,
+ wrapper::vgethigh(requantized_dst));
+ }
+ else
+ {
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+ vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+ vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+ vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+ const q8x8_t res1 =
+ wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+ const q8x8_t res2 =
+ wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }
+ }
+ else
+ {
+ q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x16_t data = wrapper::vloadq(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo)
+ ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+ wrapper::vgethigh(vres), requant_qinfo)
+ : vres);
+ }
+ }
+
+ if (pool_info.pool_type == PoolingType::MAX)
+ {
+ for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
+ {
+ q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x8_t data = wrapper::vload(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ }
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ q32_t res = static_cast<q32_t>(0.f);
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const T data =
+ *(reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res += data;
+ }
+ }
+
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
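+                        // new_scale folds the pooling divisor (scale) into the requantization
+                        // rescale, so averaging and requantization are intended to happen in a
+                        // single quantize<T>() call.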
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ }
+ else
+ {
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ }
+ else
+ {
+ T res = std::numeric_limits<T>::min();
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const T data =
+ *(reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
+ }
+ }
+
+ // Store result
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ }
+ }
+ },
+ in, out);
+}
+
+#if defined(ENABLE_NCHW_KERNELS)
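+// Applies a per-lane averaging scale: each of the 8 lanes corresponds to a different output x
+// position, so its effective (padding-clipped) pooling window, and hence its divisor, can differ.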
+template <typename T, typename TVec>
+inline void scale_vector_q16x8(bool exclude_padding,
+ TVec &v,
+ const Coordinates &id,
+ int id_offset,
+ int step,
+ const int pool_size,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int pad_x,
+ const int pad_y,
+ const int stride_x,
+ const int stride_y)
+{
+ int start_x = (id.x() + id_offset) * stride_x - pad_x;
+ int start_y = id.y() * stride_y - pad_y;
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ if (exclude_padding)
+ {
+ start_y = std::max(0, start_y);
+ }
+
+ std::array<T, 8> elems = {{
+ wrapper::vgetlane(v, 0),
+ wrapper::vgetlane(v, 1),
+ wrapper::vgetlane(v, 2),
+ wrapper::vgetlane(v, 3),
+ wrapper::vgetlane(v, 4),
+ wrapper::vgetlane(v, 5),
+ wrapper::vgetlane(v, 6),
+ wrapper::vgetlane(v, 7),
+ }};
+
+ for (auto &el : elems)
+ {
+ int c_start_x = start_x;
+ const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
+ if (exclude_padding)
+ {
+ c_start_x = std::max(0, c_start_x);
+ }
+ float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
+ el *= scale;
+ start_x += step * stride_x;
+ }
+
+ v = wrapper::vsetlane(elems[0], v, 0);
+ v = wrapper::vsetlane(elems[1], v, 1);
+ v = wrapper::vsetlane(elems[2], v, 2);
+ v = wrapper::vsetlane(elems[3], v, 3);
+ v = wrapper::vsetlane(elems[4], v, 4);
+ v = wrapper::vsetlane(elems[5], v, 5);
+ v = wrapper::vsetlane(elems[6], v, 6);
+ v = wrapper::vsetlane(elems[7], v, 7);
+}
+
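+// Loads 16 consecutive elements, replacing any lane outside the (srcw x srch) valid region with
+// fval; pad_r and pad_b are unused and kept only for interface symmetry.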
+template <typename T>
+auto load16_boundary_aware(
+ int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
+{
+ ARM_COMPUTE_UNUSED(pad_b, pad_r);
+ T vec[16];
+    // Handle reading a row that lies outside the tensor
+ const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
+ for (int i = 0; i < 16; i++)
+ {
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ {
+ vec[i] = *(ptr + i);
+ }
+ else
+ {
+ vec[i] = fval;
+ }
+ }
+ return wrapper::vloadq(vec);
+}
+
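+// When pool_stride_x == 1 the NCHW quantized kernels compute even and odd output columns in
+// separate vectors, so this store interleaves 'lower' and 'upper' back into consecutive elements
+// (deinterleave == true) while clamping to the destination width dst_w.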
+template <typename T, typename V, bool deinterleave>
+inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
+{
+ if (deinterleave)
+ {
+ for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
+ {
+ *(ptr + i * 2) = lower[i];
+ }
+ for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
+ {
+ *(ptr + 1 + i * 2) = upper[i];
+ }
+ }
+ else
+ {
+ for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
+ {
+ *(ptr + i) = lower[i];
+ }
+ for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
+ {
+ *(ptr + i + 8) = upper[i];
+ }
+ }
+}
+
+template <typename T, typename V>
+inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
+{
+ for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
+ {
+ *(ptr + i) = v[i];
+ }
+}
+
+template <typename T>
+void pooling2_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ /** SIMD vector types */
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+ using q16_t = typename wrapper::traits::promote_t<T>;
+ using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
+ using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
+ using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
+
+ constexpr int pool_size = 2;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const T *const src_top_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
+ const bool have_different_qinfo = src_qinfo != dst_qinfo;
+
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int dst_w = dst0->info()->dimension(0);
+
+ const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+
+ auto top_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+ q8x8_t lower_res = {};
+ q8x8_t upper_res = {};
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ const q16x8x2_t top_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+ const q16x8x2_t bottom_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+ // Add rows
+ const q16x8x2_t vrsum = {{
+ wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
+ wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
+ }};
+
+ // Pair-wise add row data
+ const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
+ const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
+
+ q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
+
+ // Scale lower result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ lower_res = wrapper::vmovn(res_lower);
+
+ // Compute upper result for stride_x == 1
+ if (pool_stride_x == 1)
+ {
+ // Shifted row sum
+ const q16x8x2_t vrsum_shifted = {
+ {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
+
+ // Pair-wise add shifted row
+ q16x8_t res_upper = wrapper::vcombine(
+ wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
+ wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]),
+ wrapper::vgethigh(vrsum_shifted.val[1])));
+
+ // Scale upper result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ upper_res = wrapper::vmovn(res_upper);
+ }
+ }
+ else
+ {
+ const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
+ lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
+ if (pool_stride_x == 1)
+ {
+ const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
+ upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
+ }
+ }
+
+ if (have_different_qinfo)
+ {
+ const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
+ lower_res = wrapper::vgetlow(requantized_dst);
+ upper_res = wrapper::vgethigh(requantized_dst);
+ }
+ auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ // Store result
+ if (pool_stride_x == 1)
+ {
+ write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
+ }
+ else
+ {
+ write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void pooling3_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ /** SIMD vector types */
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+ using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
+ using q16_t = typename wrapper::traits::promote_t<T>;
+ using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
+ using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
+
+ constexpr int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+
+ const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
+
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
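+ // Folding dequantize(src_qinfo) followed by quantize(dst_qinfo) into a single step:
+ // q_dst = q_src * src_scale / dst_scale + (dst_offset - src_offset * src_scale / dst_scale)
+ //       = q_src / requant_scale + requant_offset
+ // so one requantization with requant_qinfo maps values from the src to the dst domain.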
+
+ const T *const src_top_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_middle_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
+
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
+ const int dst_w = dst0->info()->dimension(0);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+
+ auto top_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto middle_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+ q8x8_t fres = {};
+ q8x16_t fqres = {};
+
+ if (pool_info.pool_type == PoolingType::AVG)
+ {
+ // Widen data to q16 (u16 for unsigned input, s16 for signed input)
+ const q16x8x2_t top_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+ const q16x8x2_t middle_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}};
+ const q16x8x2_t bottom_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+ // Calculate row sums
+ const q16x8x2_t vrsum = {{
+ wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
+ wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
+ }};
+ const q16x8x2_t vrsum_shifted_1 = {
+ {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
+ const q16x8x2_t vrsum_shifted_2 = {
+ {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}};
+ // Calculate final sum
+ q16x8x2_t final_sum = {{
+ wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
+ wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
+ }};
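+ // After adding the two shifted copies, lane i of final_sum holds the sum of columns
+ // i, i+1 and i+2 of vrsum, i.e. the complete 3x3 window sum for the output at column i.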
+ if (pool_stride_x == 2)
+ {
+ q16x8_t res = {
+ wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2),
+ wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6),
+ wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2),
+ wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6),
+ };
+
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ fres = wrapper::vmovn(res);
+ }
+ else
+ {
+ // Scale lower result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ // Scale upper result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
+ }
+ }
+ else
+ {
+ const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
+ const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
+ const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
+ const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
+
+ if (pool_stride_x == 2)
+ {
+ const q8x8x2_t table = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}};
+ static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14};
+ fres = wrapper::vtbl(table, lookup_val);
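+ // With stride 2 only every other column produces an output, so the table lookup
+ // with indices {0, 2, 4, ..., 14} gathers the even lanes of final_max into one q8x8_t.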
+ }
+ else
+ {
+ fqres = final_max;
+ }
+ }
+
+ // Store result
+ if (pool_stride_x == 1)
+ {
+ if (src_qinfo != dst_qinfo)
+ {
+ fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres),
+ requant_qinfo);
+ }
+ write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres),
+ wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
+ }
+ else
+ {
+ if (src_qinfo != dst_qinfo)
+ {
+ fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
+ }
+ write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void poolingMxN_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ /** SIMD vector types */
+ using q16_t = typename wrapper::traits::promote_t<T>;
+ using q32_t = typename wrapper::traits::promote_t<q16_t>;
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+
+ const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
+ const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
+ const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ T res = std::numeric_limits<T>::min();
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ q32_t sres = 0;
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
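+ // scale is the reciprocal of the number of pooled elements for this output position
+ // (padding excluded when requested), so rounding sres * scale below yields the average.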
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(
+ in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+ sres += data;
+ }
+ }
+ // Divide by scale
+ res = static_cast<T>(support::cpp11::round(sres * scale));
+ }
+ else
+ {
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(
+ in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+ res = std::max(res, data);
+ }
+ }
+ }
+ // Store result
+ res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(
+ Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo)
+ : res;
+ *(reinterpret_cast<T *>(out.ptr())) = res;
+ },
+ in, out);
+}
+#endif /* defined(ENABLE_NCHW_KERNELS) */
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H
diff --git a/src/cpu/kernels/pool3d/list.h b/src/cpu/kernels/pool3d/list.h
new file mode 100644
index 0000000000..3426360f93
--- /dev/null
+++ b/src/cpu/kernels/pool3d/list.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+#define SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_POOLING_KERNEL(func_name) \
+ void func_name(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
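+
+// For example, DECLARE_POOLING_KERNEL(neon_fp32_pool3d); declares:
+//   void neon_fp32_pool3d(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window);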
+
+DECLARE_POOLING_KERNEL(neon_q8_pool3d);
+DECLARE_POOLING_KERNEL(neon_q8_signed_pool3d);
+DECLARE_POOLING_KERNEL(neon_fp16_pool3d);
+DECLARE_POOLING_KERNEL(neon_fp32_pool3d);
+
+#undef DECLARE_POOLING_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/fp16.cpp b/src/cpu/kernels/pool3d/neon/fp16.cpp
new file mode 100644
index 0000000000..0130a96098
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp16.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_fp_neon_ndhwc<float16_t>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/pool3d/neon/fp32.cpp b/src/cpu/kernels/pool3d/neon/fp32.cpp
new file mode 100644
index 0000000000..2c06a9d57a
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_fp_neon_ndhwc<float>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
new file mode 100644
index 0000000000..ce89199b5d
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_POOLING_3D_LAYER_IMPL_H
+#define SRC_CORE_POOLING_3D_LAYER_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/helpers/PoolingHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool3d/neon/quantized.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+template <typename T>
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
+
+{
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ Iterator out(dst0, window_out);
+
+ vector_type vres;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The window ends used in the calculation must exclude the padding regions (PAD_X, PAD_Y and PAD_Z)
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
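+ // pool_start_* / pool_end_* clamp the pooling window to the valid input region, so the
+ // loops below never read outside the tensor. in_ptr_n selects the batch (dimension 4 of
+ // NDHWC) and x_off walks the contiguous channel dimension.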
+
+ int x_off = window_start_x;
+
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+ }
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ T res = -std::numeric_limits<float>::infinity();
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res = std::max(res, data);
+ }
+ }
+ }
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
+}
+
+template <typename T>
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
+{
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_right = static_cast<int>(pool_info.padding.right);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+ const int pool_pad_back = static_cast<int>(pool_info.padding.back);
+
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ Iterator out(dst0, window_out);
+
+ vector_type vres;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The window ends used in the calculation must exclude the padding regions (PAD_X, PAD_Y and PAD_Z)
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
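+ // scale is the reciprocal of the number of elements contributing to this output
+ // (padding excluded when requested), so multiplying the accumulated sum by scale_v
+ // below produces the average.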
+
+ int x_off = window_start_x;
+
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vadd(vres, data);
+ }
+ }
+ }
+
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ T res(0);
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
+}
+
+template <typename T>
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
+{
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_right = static_cast<int>(pool_info.padding.right);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+ const int pool_pad_back = static_cast<int>(pool_info.padding.back);
+
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ Iterator out(dst0, window_out);
+
+ vector_type vres;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The window ends used in the calculation must exclude the padding regions (PAD_X, PAD_Y and PAD_Z)
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+
+ int x_off = window_start_x;
+
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmla(vres, data, data);
+ }
+ }
+ }
+
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
+
+ // Calculate square-root
+ vres = wrapper::vinv(wrapper::vinvsqrt(vres));
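+ // sqrt is computed as the reciprocal of the reciprocal square root:
+ // vinvsqrt(x) = 1/sqrt(x), so vinv(vinvsqrt(x)) = sqrt(x).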
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ T res(0);
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data * data;
+ }
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+
+ // Square root
+ res = std::sqrt(res);
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
+}
+} // namespace
+
+template <typename T>
+void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ constexpr int window_step_x = 16 / sizeof(T);
+ Window window_out = window;
+
+ // Needed to handle loop left-over
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
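+ // Collapsing DimX to a single iteration stops execute_window_loop from stepping over
+ // the channel dimension; each kernel walks channels itself, window_step_x lanes at a
+ // time, followed by a scalar left-over loop.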
+
+ switch (pool_info.pool_type)
+ {
+ case PoolingType::MAX:
+ max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
+ break;
+ case PoolingType::AVG:
+ avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
+ break;
+ case PoolingType::L2:
+ l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pool operation not supported");
+ }
+}
+
+template <typename T>
+void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ constexpr int window_step_x = 16;
+ Window window_out = window;
+
+ // Needed to handle loop left-over
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ switch (pool_info.pool_type)
+ {
+ case PoolingType::MAX:
+ max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
+ break;
+ case PoolingType::AVG:
+ avg_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pool operation not supported");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_POOLING_3D_LAYER_IMPL_H
diff --git a/src/cpu/kernels/pool3d/neon/qasymm8.cpp b/src/cpu/kernels/pool3d/neon/qasymm8.cpp
new file mode 100644
index 0000000000..650a815e76
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/qasymm8.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_q8_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_q8_neon_ndhwc<uint8_t>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..374b2435ea
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_q8_signed_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_q8_neon_ndhwc<int8_t>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h
new file mode 100644
index 0000000000..8819907901
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/quantized.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
+#define SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/helpers/PoolingHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void avg_poolingMxNxD_q8_neon_ndhwc(
+ const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
+
+{
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+ using q16_t = typename wrapper::traits::promote_t<T>;
+ using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
+ using q32_t = typename wrapper::traits::promote_t<q16_t>;
+ using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_right = static_cast<int>(pool_info.padding.right);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+ const int pool_pad_back = static_cast<int>(pool_info.padding.back);
+
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+ const int input_dim_c = src->info()->dimension(0);
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ const int window_end_x = input_dim_c;
+ const int window_start_x = 0;
+
+ Iterator out(dst0, window_out);
+
+ const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
+
+ const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
+ // "new_offset" doesn't have to consider the "half_scale_v" in its computation
+ // With a requantization performed in a single step there won't be uncertainties introduced
+ const int32_t new_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
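+ // Combined mapping used below: q_dst = (sum * scale) / quant_rescale + new_offset,
+ // i.e. the window average and the src->dst requantization are applied in one step.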
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The window ends used in the calculation must exclude the padding regions (PAD_X, PAD_Y and PAD_Z)
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ int x_off = window_start_x;
+
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+ const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+ vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+ vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+ vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+ vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
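+ // The 16 8-bit lanes are widened in two steps (8 -> 16 -> 32 bit) and accumulated
+ // across four 32-bit vectors so the per-channel sums cannot overflow for large
+ // pooling volumes.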
+ }
+ }
+ }
+
+ if (src_qinfo != dst_qinfo)
+ {
+ const float32x4x4_t vres = {{
+ vcvtq_f32_q32(vres1),
+ vcvtq_f32_q32(vres2),
+ vcvtq_f32_q32(vres3),
+ vcvtq_f32_q32(vres4),
+ }};
+ const auto requantized_dst =
+ vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+ }
+ else
+ {
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+ vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+ vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+ vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+ const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+ const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ q32_t res = 0;
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
+ }
+ }
+
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ }
+ else
+ {
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ }
+ },
+ out);
+}
+
+template <typename T>
+void max_poolingMxNxD_q8_neon_ndhwc(
+ const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
+
+{
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+
+ const int window_half_step_x = window_step_x / 2;
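+ // window_half_step_x lets the left-over loop below first process 8 channels at a time
+ // with 64-bit vectors before falling back to the scalar loop.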
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+
+ const int input_dim_c = src->info()->dimension(0);
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ const int window_end_x = input_dim_c;
+ const int window_start_x = 0;
+
+ Iterator out(dst0, window_out);
+
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
+
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The window ends used in the calculation must exclude the padding regions (PAD_X, PAD_Y and PAD_Z)
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ int x_off = window_start_x;
+
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+ }
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo)
+ ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+ wrapper::vgethigh(vres), requant_qinfo)
+ : vres);
+ }
+
+ // Leftovers using half the window step
+ for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
+ {
+ q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+ }
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ T res = std::numeric_limits<T>::min();
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ res = std::max(res, data);
+ }
+ }
+ }
+
+ // Store result
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ }
+ },
+ out);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
diff --git a/src/cpu/kernels/quantize/generic/neon/fp16.cpp b/src/cpu/kernels/quantize/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..37bfb5b2aa
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/fp16.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/quantize/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<float16_t, uint8_t>(src, dst, window);
+}
+void fp16_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<float16_t, int8_t>(src, dst, window);
+}
+void fp16_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm16<float16_t>(src, dst, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/quantize/generic/neon/fp32.cpp b/src/cpu/kernels/quantize/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..0cba332fd6
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/fp32.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/quantize/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<float, uint8_t>(src, dst, window);
+}
+void fp32_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<float, int8_t>(src, dst, window);
+}
+void fp32_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm16<float>(src, dst, window);
+}
+
+void fp32_i8_run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qsymm8<float, int8_t>(src, dst, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/quantize/generic/neon/impl.h b/src/cpu/kernels/quantize/generic/neon/impl.h
new file mode 100644
index 0000000000..9954a7645e
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/impl.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
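+// Number of elements processed per vectorized loop iteration.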
+constexpr auto window_step = 16;
+
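+// Load 16 elements of the source type and convert them to four float32x4_t vectors.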
+template <typename T>
+inline float32x4x4_t load_value(const T *input_ptr)
+{
+ using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+ return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
+}
+
+template <>
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8),
+ wrapper::vloadq(input_ptr + 12)};
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename element_type>
+using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>;
+
+template <typename quantized_type>
+inline vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
+
+template <>
+inline vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+ return vquantize(qv, qi);
+}
+
+template <>
+inline vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+ return vquantize_signed(qv, qi);
+}
+
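+// Narrow two int16x8_t halves into one saturated 8-bit vector; the enable_if overloads pick
+// signed (vqmovn) or unsigned (vqmovun) narrowing based on the output type.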
+template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
+inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper));
+}
+
+template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type>
+inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper));
+}
+
+template <typename TIn, typename TOut>
+void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
+void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
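+ // This path applies only the zero-point difference between src and dst (no rescaling is
+ // performed, so it is exact only when the two scales match); the 8-bit values are then
+ // reinterpreted in the output signedness.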
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Calculate output offset difference.
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const wrapper::traits::neon_vector_t<TIn, window_step> qv =
+ wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+
+ // Signed addition.
+ auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset);
+
+ // Output is dependent on datatype.
+ wrapper::vstore(&output_ptr[x],
+ reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
+void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
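+ // Only the zero-point difference between src and dst is applied here: values are widened to
+ // 16 bits, offset with saturating adds, and narrowed back to the 8-bit output range.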
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128;
+ const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127;
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+ int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv)));
+ int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv)));
+
+ // Signed addition.
+ lower = wrapper::vqadd(lower, offset);
+ upper = wrapper::vqadd(upper, offset);
+
+ // Output is dependent on datatype.
+ auto res = recombine_8_16<TOut>(lower, upper);
+ wrapper::vstore(&output_ptr[x], res);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Add offset and clamp result to within the range of the output datatype.
+ int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ result = utility::clamp<int32_t>(result, low_bound, upper_bound);
+
+ // Cast result to output datatype.
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
+void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+ {
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+ }
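+ // Rounding used for the scalar tail loop: round-to-nearest-even on AArch64, truncation elsewhere.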
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+ }
+ },
+ input, output);
+}
+
+template <typename T>
+void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+ {
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+ }
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
+ vst1q_u16(&output_ptr[x], tmp.val[0]);
+ vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+ }
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/quantize/generic/neon/integer.cpp b/src/cpu/kernels/quantize/generic/neon/integer.cpp
new file mode 100644
index 0000000000..4e39afaaee
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/integer.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/quantize/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void u8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<uint8_t, uint8_t>(src, dst, window);
+}
+void u8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<uint8_t, int8_t>(src, dst, window);
+}
+void i8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<int8_t, uint8_t>(src, dst, window);
+}
+void i8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm8<int8_t, int8_t>(src, dst, window);
+}
+
+void u8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm16<uint8_t>(src, dst, window);
+}
+void i8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_quantize_qasymm16<int8_t>(src, dst, window);
+}
+
+void u8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_requantize_offset_only<uint8_t, uint8_t>(src, dst, window);
+}
+void u8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_requantize_offset_only<uint8_t, int8_t>(src, dst, window);
+}
+void i8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_requantize_offset_only<int8_t, uint8_t>(src, dst, window);
+}
+void i8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_requantize_offset_only<int8_t, int8_t>(src, dst, window);
+}
+
+void i8_u8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_requantize_offset_only_convert<int8_t, uint8_t>(src, dst, window);
+}
+void u8_i8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+ run_requantize_offset_only_convert<uint8_t, int8_t>(src, dst, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/quantize/generic/neon/list.h b/src/cpu/kernels/quantize/generic/neon/list.h
new file mode 100644
index 0000000000..c4fb1048eb
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/list.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#define DECLARE_QUANTIZE_KERNEL(func_name) void func_name(const ITensor *src, ITensor *dst, const Window &window)
+
+DECLARE_QUANTIZE_KERNEL(u8_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(u8_i8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(i8_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(i8_i8_run_quantize_qasymm8);
+
+DECLARE_QUANTIZE_KERNEL(u8_u8_run_requantize_offset_only);
+DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only);
+DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only);
+DECLARE_QUANTIZE_KERNEL(i8_i8_run_requantize_offset_only);
+
+DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only_convert);
+DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only_convert);
+
+DECLARE_QUANTIZE_KERNEL(u8_run_quantize_qasymm16);
+DECLARE_QUANTIZE_KERNEL(i8_run_quantize_qasymm16);
+
+DECLARE_QUANTIZE_KERNEL(fp32_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp32_run_quantize_qasymm16);
+
+DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qsymm8);
+
+DECLARE_QUANTIZE_KERNEL(fp16_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp16_i8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp16_run_quantize_qasymm16);
+
+#undef DECLARE_QUANTIZE_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..505c18c27d
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/fp16.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<float16_t>(output, start, step, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..e5e472abb5
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<float32_t>(output, start, step, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h
new file mode 100644
index 0000000000..f8c30d52a0
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/impl.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
+
+ const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
+ const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
+ auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16 / sizeof(T);
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator output_it(output, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ for (int count = 0; count < window_step_x; ++count)
+ {
+ id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
+ }
+
+ // start + step * id
+ const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+ wrapper::vstore(out_ptr + x, res_vec);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto res = start + x * step;
+ *(out_ptr + x) = res;
+ }
+ },
+ output_it);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/range/generic/neon/integer.cpp b/src/cpu/kernels/range/generic/neon/integer.cpp
new file mode 100644
index 0000000000..0f3ff89b71
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/integer.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/range/generic/neon/impl.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void u8_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<uint8_t>(output, start, step, window);
+}
+
+void u16_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<uint16_t>(output, start, step, window);
+}
+
+void u32_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<uint32_t>(output, start, step, window);
+}
+
+void s8_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<int8_t>(output, start, step, window);
+}
+
+void s16_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<int16_t>(output, start, step, window);
+}
+
+void s32_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<int32_t>(output, start, step, window);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h
new file mode 100644
index 0000000000..cade91e8dd
--- /dev/null
+++ b/src/cpu/kernels/range/list.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_RANGE_LIST_H
+#define SRC_CORE_NEON_KERNELS_RANGE_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window)
+
+DECLARE_RANGE_KERNEL(fp16_neon_range_function);
+DECLARE_RANGE_KERNEL(fp32_neon_range_function);
+DECLARE_RANGE_KERNEL(s8_neon_range_function);
+DECLARE_RANGE_KERNEL(s16_neon_range_function);
+DECLARE_RANGE_KERNEL(s32_neon_range_function);
+DECLARE_RANGE_KERNEL(u8_neon_range_function);
+DECLARE_RANGE_KERNEL(u16_neon_range_function);
+DECLARE_RANGE_KERNEL(u32_neon_range_function);
+
+#undef DECLARE_RANGE_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_RANGE_LIST_H
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp b/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..143bb5487f
--- /dev/null
+++ b/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void reduce_RedOpX_reduceX_float16_8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
+}
+
+void reduce_RedOpYZW_reduceY_float16_8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
+}
+
+void reduce_RedOpYZW_reduceZ_float16_8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
+}
+
+void reduce_RedOpYZW_reduceW_float16_8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp b/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..6f5f13e571
--- /dev/null
+++ b/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(
+ window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+}
+
+void reduce_RedOpX_reduceX_float32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+}
+
+void reduce_RedOpYZW_reduceY_float32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
+}
+
+void reduce_RedOpYZW_reduceZ_float32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
+}
+
+void reduce_RedOpYZW_reduceW_float32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/impl.h b/src/cpu/kernels/reduction_layer/generic/neon/impl.h
new file mode 100644
index 0000000000..3fa821d3a4
--- /dev/null
+++ b/src/cpu/kernels/reduction_layer/generic/neon/impl.h
@@ -0,0 +1,1633 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "support/SaturateCast.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+// Helper function that calls vqmovun/vqmovn, vcombine and vstore; allows templating of RedOpYZW_quantized
+template <typename T>
+void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0)
+{
+ if (std::is_same<T, uint8_t>::value)
+ {
+ auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2));
+ wrapper::vstore(output.ptr() + offset, res);
+ }
+ else
+ {
+ auto res = wrapper::vcombine(wrapper::vqmovn(t1), wrapper::vqmovn(t2));
+ wrapper::vstore(reinterpret_cast<int8_t *>(output.ptr() + offset), res);
+ }
+}
+
+template <typename T>
+uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4_t mask{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask = wrapper::vclt(b, a);
+ }
+
+ uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3};
+ if (axis != 0)
+ {
+ vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}};
+
+ return res;
+}
+
+template <typename T>
+uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x4_t mask{{0}};
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u8 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u8 = wrapper::vclt(b, a);
+ }
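+ // Widen the 8-bit comparison mask to four 32-bit lane masks (each 0xFF byte becomes 0xFFFFFFFF).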
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ mask.val[0] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ mask.val[1] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ mask.val[2] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ mask.val[3] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+
+ uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3},
+ {idx + 4, idx + 5, idx + 6, idx + 7},
+ {idx + 8, idx + 9, idx + 10, idx + 11},
+ {idx + 12, idx + 13, idx + 14, idx + 15}}};
+ if (axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = {
+ {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+ vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}};
+
+ return res;
+}
+
+// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
+template <typename T>
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_min(T in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmin(pmin, pmin);
+}
+
+// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
+template <typename T>
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_min(T in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ return wrapper::vpmin(pmin, pmin);
+}
+
+// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
+template <typename T>
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_max(T in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmax(pmax, pmax);
+}
+
+// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
+template <typename T>
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_max(T in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ return wrapper::vpmax(pmax, pmax);
+}
+
+template <typename T>
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
+{
+ uint32x4_t res_idx_mask{0};
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+
+ if (op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = calculate_min(vec_res_value);
+ auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+ else
+ {
+ auto pmax = calculate_max(vec_res_value);
+ auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+
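+ // Bias by 0xFFFFFFFF (subtract 1 mod 2^32) so that unselected lanes, which are zero, become
+ // UINT32_MAX and lose the pairwise minimum; the bias is undone when the result is returned.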
+ res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
+ pmin = wrapper::vpmin(pmin, pmin);
+ uint32_t res = wrapper::vgetlane(pmin, 0);
+
+ return (res - 0xFFFFFFFF);
+}
+
+template <typename T>
+uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
+{
+ uint32x4x4_t res_idx_mask{{0}};
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = calculate_min(vec_res_value);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = calculate_max(vec_res_value);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ auto wide_u32_3 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ auto wide_u32_4 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
+ res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+ res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
+ res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ int iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ } while (iter < 4);
+
+ return (res - 0xFFFFFFFF);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+uint32x4x4_t inline calculate_index(
+ uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x2_t mask{0};
+ uint16x8_t mask_u16{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u16 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u16 = wrapper::vclt(b, a);
+ }
+ mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
+ mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
+ uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}};
+ if (axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+ wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0};
+
+ return res;
+}
+
+// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
+inline float16x4_t calculate_min(float16x8_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmin = wrapper::vpmin(pmin, pmin);
+ return wrapper::vpmin(pmin, pmin);
+}
+// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
+inline float16x4_t calculate_max(float16x8_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmax = wrapper::vpmax(pmax, pmax);
+ return wrapper::vpmax(pmax, pmax);
+}
+
+template <>
+inline uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
+{
+ uint32x4x2_t res_idx_mask{0};
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint16x8_t mask_u16;
+ if (op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = calculate_min(vec_res_value);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = calculate_max(vec_res_value);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ uint32_t iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ } while (iter < 2);
+
+ return (res - 0xFFFFFFFF);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
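+// Prepares the input/output windows for a reduction along X, Y, Z or W and invokes the
+// reduction functor F on them.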
+template <class F>
+class Reducer
+{
+public:
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ f(window, out_window, input, output, op);
+ }
+ static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
+ {
+ // Set in window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));
+
+ f(in_window, out_window, input, output, 1, op);
+ }
+ static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
+ {
+ // Set in window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));
+
+ f(in_window, out_window, input, output, 2, op);
+ }
+ static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
+ {
+ // Set in/out window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(3, Window::Dimension(0, 1, 1));
+ out_window.set(3, Window::Dimension(0, 1, 1));
+
+ f(in_window, out_window, input, output, 3, op);
+ }
+};
+
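+// Reduction along the X (innermost) axis for non-quantized types: accumulates full vectors of
+// window_step_x elements, then finishes with a horizontal reduction and a scalar tail.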
+template <typename T, int S>
+struct RedOpX
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ {
+ const size_t input_dim_0 = in->info()->dimension(0);
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(in_window.x().start());
+ const auto window_end_x = static_cast<int>(in_window.x().end());
+
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_window);
+
+ execute_window_loop(
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+
+ auto init_res_value = static_cast<T>(0.f);
+ switch (op)
+ {
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
+ {
+ init_res_value = static_cast<T>(*input_ptr);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ init_res_value = static_cast<T>(1.f);
+ break;
+ }
+ default:
+ break;
+ }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+ uint32x4x4_t vec_res_idx{{0}};
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
+ {
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM_SQUARE:
+ {
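+ // Horizontal reduction of the vector accumulator: debug builds sum the lanes one by one,
+ // release builds collapse the vector with pairwise adds (vpadd).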
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+ auto res = static_cast<T>(0.f);
+ for (int i = 0; i < S; ++i)
+ {
+ res += wrapper::vgetlane(vec_res_value, i);
+ }
+#else // ARM_COMPUTE_DEBUG_ENABLED
+ auto carry_res =
+ wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ for (int i = 0; i < S / 4; ++i)
+ {
+ carry_res = wrapper::vpadd(carry_res, carry_res);
+ }
+ auto res = wrapper::vgetlane(carry_res, 0);
+#endif // ARM_COMPUTE_DEBUG_ENABLED
+ if (op == ReductionOperation::SUM_SQUARE)
+ {
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += (*(input_ptr + x)) * (*(input_ptr + x));
+ }
+ }
+ else
+ {
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
+ }
+
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res /= input_dim_0;
+ }
+
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ auto carry_res =
+ wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ T res = 1;
+ for (int i = 0; i < S / 2; ++i)
+ {
+ res *= wrapper::vgetlane(carry_res, i);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res *= *(input_ptr + x);
+ }
+
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ },
+ input, output);
+ }
+};
+
+template <typename T>
+struct RedOpX_quantized
+{
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ {
+ using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+
+ const auto oq_info = out->info()->quantization_info().uniform();
+
+ const TensorInfo in_info = *(in->info());
+ const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
+
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(in_window.x().start());
+ const auto window_end_x = static_cast<int>(in_window.x().end());
+
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_window);
+
+ const auto in_offset = static_cast<float>(iq_info.offset);
+ const float in_scale = iq_info.scale;
+
+ const auto out_offset = static_cast<float>(oq_info.offset);
+ const float out_scale = oq_info.scale;
+
+ const auto num_elements = static_cast<float>(in_info.dimension(0));
+
+ const float A = in_scale / (out_scale * num_elements);
+ const float B = out_offset - (in_scale * in_offset) / (out_scale);
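+ // MEAN_SUM rescaling: dequantizing the accumulated sum and requantizing the mean gives
+ // out_q = in_scale * (sum_q - N * in_offset) / (N * out_scale) + out_offset = A * sum_q + B,
+ // so A and B can be applied as a single affine transform in the epilogue.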
+
+ execute_window_loop(
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+
+ auto vec_res_value1 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value2 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value3 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value4 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+
+ auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+
+ typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0};
+
+ if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN ||
+ op == ReductionOperation::MIN || op == ReductionOperation::MAX)
+ {
+ vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
+ }
+
+ uint32x4x4_t vec_res_idx{{0}};
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
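+ // Widen the 8-bit lanes in two vmovl steps (8 -> 16 -> 32 bit) and accumulate into four
+ // 32-bit vectors so the running sum cannot overflow.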
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
+ const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
+
+ float res = wrapper::vgetlane(carry_res, 0);
+ res *= wrapper::vgetlane(carry_res, 1);
+ res *= wrapper::vgetlane(carry_res, 2);
+ res *= wrapper::vgetlane(carry_res, 3);
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ }
+ else
+ {
+ res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ }
+ }
+
+ //re-quantize result
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res = quantize_qasymm8(res, iq_info);
+ }
+ else
+ {
+ res = quantize_qasymm8_signed(res, iq_info);
+ }
+
+ *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
+ break;
+ }
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
+
+ auto carry_paddition =
+ wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
+
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ const int32_t resFinal = A * (static_cast<float>(res)) + B;
+
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
+ }
+ else
+ {
+ // Subtract accumulated offsets
+ res -= (in_info.dimension(0) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ }
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ },
+ input, output);
+ }
+};
+
+template <typename T, int S>
+struct RedOpYZW
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
+
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
+ {
+ const TensorInfo in_info = *(in->info());
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
+ const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
+ // As the window is split over the x-axis, set the correct split window start and end.
+ const auto window_start_x = static_cast<int>(0);
+ const auto window_end_x = static_cast<int>(in_window.shape().x());
+
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
+ Window out_win_no_pad = out_window;
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_win_no_pad);
+
+ execute_window_loop(
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ neon_vector vec_res_value = {0};
+ switch (op)
+ {
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vloadq(input_ptr + x);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+ break;
+ }
+ default:
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ break;
+ }
+ }
+ uint32x4x4_t vec_res_idx{{0}};
+
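+ // Walk along the reduced axis using its byte stride; each load still covers consecutive
+ // x elements of the (unreduced) innermost dimension.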
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ auto vec_width_inv =
+ wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
+ vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
+ }
+
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ if (std::is_same<T, float16_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
+ }
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto res_value = 0.f;
+ switch (op)
+ {
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
+ {
+ res_value = *(input_ptr + x);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ res_value = static_cast<T>(1.f);
+ break;
+ }
+ default:
+ {
+ res_value = static_cast<T>(0.f);
+ break;
+ }
+ }
+
+ uint32_t res_idx = 0;
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ res_value += *in_ptr;
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ res_value += *in_ptr * *in_ptr;
+ break;
+ case ReductionOperation::PROD:
+ res_value *= *in_ptr;
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res_value /= in_info.dimension(axis);
+ }
+
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ }
+ }
+ },
+ input, output);
+ }
+};
+
+template <typename T, int S, int axis, ReductionOperation op>
+struct RedOpYZW_complex
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
+
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
+ {
+ ARM_COMPUTE_ERROR_ON(axis != 2);
+ ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);
+
+ const TensorInfo in_info = *(in->info());
+ const size_t stride_z = in_info.strides_in_bytes()[axis];
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
+ const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
+ // As the window is split over the x-axis, set the correct split window start and end.
+ const auto window_start_x = static_cast<int>(0);
+ const auto window_end_x = static_cast<int>(in_window.shape().x());
+
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
+ Window out_win_no_pad = out_window;
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_win_no_pad);
+
+ execute_window_loop(
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ neon_vector vec_res_value_0 = {0};
+ neon_vector vec_res_value_1 = {0};
+
+ vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
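+ // The data is interleaved (real, imaginary) pairs, so x indexes complex elements: each
+ // iteration covers 2 * window_step_x scalars, accumulated in two vectors 16 bytes apart.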
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
+
+ const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
+ const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+
+ vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+ vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+ }
+
+ wrapper::vstore(out_ptr, vec_res_value_0);
+ wrapper::vstore(out_ptr + 4, vec_res_value_1);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto res_value_0 = 0.f;
+ auto res_value_1 = 0.f;
+
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ res_value_0 += *in_ptr;
+ res_value_1 += *(in_ptr + 1);
+ }
+ *out_ptr = res_value_0;
+ *(out_ptr + 1) = res_value_1;
+ }
+ },
+ input, output);
+ }
+};
+
+template <typename T>
+struct RedOpYZW_quantized
+{
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
+ {
+ const TensorInfo in_info = *(in->info());
+ const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
+ using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+
+ const auto oq_info = out->info()->quantization_info().uniform();
+
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
+ const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
+ // As the window is split over the x-axis, set the correct split window start and end.
+ const auto window_start_x = static_cast<int>(0);
+ const auto window_end_x = static_cast<int>(in_window.shape().x());
+
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
+ Window out_win_no_pad = out_window;
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_win_no_pad);
+
+ using vector_type =
+ typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
+ using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type;
+
+ vector_type vec_res_value1{};
+ vector_type vec_res_value2{};
+ vector_type vec_res_value3{};
+ vector_type vec_res_value4{};
+
+ vector_type_f vec_res_value1_f{};
+ vector_type_f vec_res_value2_f{};
+ vector_type_f vec_res_value3_f{};
+ vector_type_f vec_res_value4_f{};
+
+ const float in_offset = static_cast<float>(iq_info.offset);
+ const float in_scale = iq_info.scale;
+
+ const float out_offset = static_cast<float>(oq_info.offset);
+ const float out_scale = oq_info.scale;
+
+ const float num_elements = static_cast<float>(in_info.dimension(axis));
+
+ const float A = in_scale / (out_scale * num_elements);
+ const float B = out_offset - (in_scale * in_offset) / (out_scale);
+
+ const auto vec_A = wrapper::vdup_n(static_cast<float>(A), wrapper::traits::vector_128_tag{});
+ const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{});
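+ // Broadcasting A and B lets the MEAN_SUM epilogue apply the requantization as a single
+ // fused multiply-add per vector (see the vmla calls below).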
+
+ execute_window_loop(
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ uint32x4x4_t vec_res_idx{{0}};
+ vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+
+ vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+
+ auto vec_res_value = wrapper::vloadq(input_ptr + x);
+
+ for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
+ {
+ const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset),
+ wrapper::traits::vector_128_tag{});
+ const auto scale32x4f_4 =
+ wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12,
+ vec_res_idx.val[3]);
+ break;
+ }
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
+ {
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
+ break;
+ }
+ case ReductionOperation::SUM:
+ {
+ // Subtract offsets
+ auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
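+ // The accumulated sum contains one input zero-point per added element; removing (N - 1)
+ // of them leaves a value that is again a valid quantized sum in the input's quantization space.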
+
+ auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
+ auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
+ auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
+ auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
+
+ vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
+ vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
+ vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
+ vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
+
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
+
+ combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
+ break;
+ }
+ case ReductionOperation::MEAN_SUM:
+ {
+ vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+ vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+ vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+ vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
+
+#ifdef __aarch64__
+ vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+#else // defined(__aarch64__)
+ vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
+#endif // __aarch64__
+
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 =
+ wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
+ const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
+
+ //re-quantize
+ vec_res_value1_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value2_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value3_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value4_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+
+ vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ float res_value = 0.f;
+ int32_t res_value_q = 0;
+
+ switch (op)
+ {
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
+ {
+ res_value = *(input_ptr + x);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ res_value = static_cast<T>(1.0f);
+ break;
+ }
+ default:
+ {
+ res_value = static_cast<T>(0.0f);
+ break;
+ }
+ }
+ uint32_t res_idx = 0;
+
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ {
+ res_value += *in_ptr;
+ break;
+ }
+ case ReductionOperation::MEAN_SUM:
+ {
+ res_value_q += *in_ptr;
+ break;
+ }
+ case ReductionOperation::SUM_SQUARE:
+ {
+ res_value += *in_ptr * *in_ptr;
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+ }
+ else
+ {
+ res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::MEAN_SUM:
+ {
+ // Apply previously calculated coefficients (with rounding on aarch64)
+#ifdef __aarch64__
+ const int32_t res =
+ arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+#else // defined(__aarch64__)
+ const int32_t res = A * (static_cast<float>(res_value_q)) + B;
+#endif // __aarch64__
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
+ break;
+ }
+ case ReductionOperation::SUM:
+ {
+ // Subtract accumulated offsets
+ res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ //re-quantize result
+ T res = 0;
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res = quantize_qasymm8(res_value, iq_info);
+ }
+ else
+ {
+ res = quantize_qasymm8_signed(res_value, iq_info);
+ }
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
+ break;
+ }
+ default:
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
+ }
+ }
+ },
+ input, output);
+ }
+};
+
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp b/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp
new file mode 100644
index 0000000000..ad66b456ac
--- /dev/null
+++ b/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
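+// Thin dispatch stubs: each one binds the generic Reducer driver to the S32 RedOpX / RedOpYZW
+// functors for one reduction axis.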
+void reduce_RedOpX_reduceX_S32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
+}
+
+void reduce_RedOpYZW_reduceY_S32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op);
+}
+void reduce_RedOpYZW_reduceZ_S32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op);
+}
+
+void reduce_RedOpYZW_reduceW_S32_4(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/list.h b/src/cpu/kernels/reduction_layer/generic/neon/list.h
new file mode 100644
index 0000000000..947c28a130
--- /dev/null
+++ b/src/cpu/kernels/reduction_layer/generic/neon/list.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#define DECLARE_REDUCTION_KERNEL(func_name) \
+ void func_name(const Window &window, const ITensor *in, ITensor *out, const ReductionOperation op)
+
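+// Naming convention, as used by the definitions in this directory:
+// reduce_<functor>_<reduced axis>_<data type>[_<vector lanes>].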
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_float32_4);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_float32_4);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_float32_4);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_float32_4);
+
+DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_float16_8);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_float16_8);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_float16_8);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_float16_8);
+
+DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_S32_4);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_S32_4);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_S32_4);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_S32_4);
+
+DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_qasymm8);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_qasymm8);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_qasymm8);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_qasymm8);
+
+DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_qasymm8_signed);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_qasymm8_signed);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_qasymm8_signed);
+DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_qasymm8_signed);
+
+#undef DECLARE_REDUCTION_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..bc711c6855
--- /dev/null
+++ b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void reduce_RedOpX_reduceX_qasymm8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+}
+
+void reduce_RedOpYZW_reduceY_qasymm8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+}
+
+void reduce_RedOpYZW_reduceZ_qasymm8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+}
+
+void reduce_RedOpYZW_reduceW_qasymm8(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..10ac3d6715
--- /dev/null
+++ b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void reduce_RedOpX_reduceX_qasymm8_signed(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+}
+
+void reduce_RedOpYZW_reduceY_qasymm8_signed(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+}
+
+void reduce_RedOpYZW_reduceZ_qasymm8_signed(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+}
+
+void reduce_RedOpYZW_reduceW_qasymm8_signed(const Window &window,
+ const ITensor *input,
+ ITensor *output,
+ const ReductionOperation op)
+{
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..cf99830562
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ return roi_align<float16_t, float16_t>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..c1dba99b5e
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ return roi_align<float, float>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h
new file mode 100644
index 0000000000..db2f67705d
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/impl.h
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_ROIALIGN_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_ROIALIGN_GENERIC_NEON_IMPL_H
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+/** Average pooling over an aligned window */
+template <typename input_data_type>
+inline input_data_type roi_align_1x1(const ITensor *input,
+ unsigned int roi_batch,
+ float region_start_x,
+ float bin_size_x,
+ int grid_size_x,
+ float region_end_x,
+ float region_start_y,
+ float bin_size_y,
+ int grid_size_y,
+ float region_end_y,
+ int pz)
+{
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ return input_data_type(0);
+ }
+ else
+ {
+ const DataLayout data_layout = input->info()->data_layout();
+ float avg = 0;
+ // Iterate through the aligned pooling region
+ for (int iy = 0; iy < grid_size_y; ++iy)
+ {
+ for (int ix = 0; ix < grid_size_x; ++ix)
+ {
+ // Align the window in the middle of every bin
+ float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
+ float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
+
+ // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
+ const int y_low = y;
+ const int x_low = x;
+ const int y_high = y_low + 1;
+ const int x_high = x_low + 1;
+
+ const float ly = y - y_low;
+ const float lx = x - x_low;
+ const float hy = 1. - ly;
+ const float hx = 1. - lx;
+
+ const float w1 = hy * hx;
+ const float w2 = hy * lx;
+ const float w3 = ly * hx;
+ const float w4 = ly * lx;
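+ // w1..w4 are the bilinear interpolation weights of the four neighbouring samples:
+ // (x_low, y_low), (x_high, y_low), (x_low, y_high), (x_high, y_high).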
+ if (data_layout == DataLayout::NCHW)
+ {
+ const auto data1 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ else
+ {
+ const auto data1 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+ }
+
+ avg /= grid_size_x * grid_size_y;
+ return input_data_type(avg);
+ }
+}
+
+/** Average pooling over an aligned window */
+template <typename input_data_type>
+inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
+ unsigned int roi_batch,
+ float region_start_x,
+ float bin_size_x,
+ int grid_size_x,
+ float region_end_x,
+ float region_start_y,
+ float bin_size_y,
+ int grid_size_y,
+ float region_end_y,
+ int pz,
+ const QuantizationInfo &out_qinfo)
+{
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ return input_data_type(out_qinfo.uniform().offset);
+ }
+ else
+ {
+ float avg = 0;
+ const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
+ const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
+ const DataLayout data_layout = input->info()->data_layout();
+
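+ // Quantized path: each neighbouring sample is dequantized, the bilinear average is computed
+ // in float, and the result is requantized with the output quantization info at the end.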
+ // Iterate through the aligned pooling region
+ for (int iy = 0; iy < grid_size_y; ++iy)
+ {
+ for (int ix = 0; ix < grid_size_x; ++ix)
+ {
+ // Align the window in the middle of every bin
+ float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
+ float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
+
+ // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
+ const int y_low = y;
+ const int x_low = x;
+ const int y_high = y_low + 1;
+ const int x_high = x_low + 1;
+
+ const float ly = y - y_low;
+ const float lx = x - x_low;
+ const float hy = 1. - ly;
+ const float hx = 1. - lx;
+
+ const float w1 = hy * hx;
+ const float w2 = hy * lx;
+ const float w3 = ly * hx;
+ const float w4 = ly * lx;
+
+ if (data_layout == DataLayout::NCHW)
+ {
+ if (is_qasymm_signed)
+ {
+ float data1 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_low, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data2 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_high, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data3 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_low, y_high, pz, roi_batch))),
+ input_qinfo);
+ float data4 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_high, y_high, pz, roi_batch))),
+ input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ else
+ {
+ float data1 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data2 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data3 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))),
+ input_qinfo);
+ float data4 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))),
+ input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+ else
+ {
+ if (is_qasymm_signed)
+ {
+ const auto data1 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_low, y_low, roi_batch))),
+ input_qinfo);
+ const auto data2 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_high, y_low, roi_batch))),
+ input_qinfo);
+ const auto data3 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_low, y_high, roi_batch))),
+ input_qinfo);
+ const auto data4 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_high, y_high, roi_batch))),
+ input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ else
+ {
+ const auto data1 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))),
+ input_qinfo);
+ const auto data2 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))),
+ input_qinfo);
+ const auto data3 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))),
+ input_qinfo);
+ const auto data4 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))),
+ input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+ }
+ }
+
+ avg /= grid_size_x * grid_size_y;
+
+ input_data_type res = 0;
+ if (is_qasymm_signed)
+ {
+ res = quantize_qasymm8_signed(avg, out_qinfo);
+ }
+ else
+ {
+ res = quantize_qasymm8(avg, out_qinfo);
+ }
+ return res;
+ }
+}
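+
+/** Compute one edge of a pooling region as roi_anchor + p * bin_size, clamped to [0, max_value] */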
+inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value)
+{
+ const float region_start = p * bin_size + roi_anchor;
+ return utility::clamp(region_start, 0.0f, max_value);
+}
+
+template <typename input_data_type, typename roi_data_type>
+void roi_align(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ const DataLayout data_layout = input->info()->data_layout();
+ const size_t values_per_roi = rois->info()->dimension(0);
+
+ const int roi_list_start = window.x().start();
+ const int roi_list_end = window.x().end();
+
+ const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const int input_width    = input->info()->dimension(idx_width);
+    const int input_height   = input->info()->dimension(idx_height);
+    const int input_channels = input->info()->dimension(idx_depth);
+    const int pooled_w       = pool_info.pooled_width();
+    const int pooled_h       = pool_info.pooled_height();
+
+ const DataType data_type = input->info()->data_type();
+ const bool is_qasymm = is_data_type_quantized_asymmetric(data_type);
+
+ const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(rois->buffer());
+ const QuantizationInfo &rois_qinfo = rois->info()->quantization_info();
+ for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ {
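+        // Each ROI is laid out as [batch_index, x1, y1, x2, y2]; quantized ROI coordinates are QASYMM16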
+ const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
+
+ roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1];
+ roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2];
+ roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3];
+ roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4];
+ float x1(qx1);
+ float x2(qx2);
+ float y1(qy1);
+ float y2(qy2);
+ if (is_qasymm)
+ {
+ x1 = dequantize_qasymm16(qx1, rois_qinfo);
+ x2 = dequantize_qasymm16(qx2, rois_qinfo);
+ y1 = dequantize_qasymm16(qy1, rois_qinfo);
+ y2 = dequantize_qasymm16(qy2, rois_qinfo);
+ }
+ const float roi_anchor_x = x1 * pool_info.spatial_scale();
+ const float roi_anchor_y = y1 * pool_info.spatial_scale();
+ const float roi_dims_x = std::max((x2 - x1) * pool_info.spatial_scale(), 1.0f);
+ const float roi_dims_y = std::max((y2 - y1) * pool_info.spatial_scale(), 1.0f);
+ float bin_size_x = roi_dims_x / pool_info.pooled_width();
+ float bin_size_y = roi_dims_y / pool_info.pooled_height();
+
+ // Iterate through all feature maps
+        for (int ch = 0; ch < input_channels; ++ch)
+ {
+ // Iterate through all output pixels
+ for (int py = 0; py < pooled_h; ++py)
+ {
+ for (int px = 0; px < pooled_w; ++px)
+ {
+ const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
+ const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
+ const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
+ const float region_end_y =
+ compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
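+                    // Use the explicit sampling ratio if provided, otherwise ceil(bin_size) points per direction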
+ const int roi_bin_grid_x =
+ (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
+ const int roi_bin_grid_y =
+ (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
+ input_data_type out_val(0);
+ if (is_qasymm)
+ {
+ out_val = roi_align_1x1_qasymm8<input_data_type>(
+ input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y,
+ bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
+ }
+ else
+ {
+ out_val = roi_align_1x1<input_data_type>(input, roi_batch, region_start_x, bin_size_x,
+ roi_bin_grid_x, region_end_x, region_start_y,
+ bin_size_y, roi_bin_grid_y, region_end_y, ch);
+ }
+
+ if (data_layout == DataLayout::NCHW)
+ {
+ auto out_ptr = reinterpret_cast<input_data_type *>(
+ output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
+ *out_ptr = out_val;
+ }
+ else
+ {
+ auto out_ptr = reinterpret_cast<input_data_type *>(
+ output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
+ *out_ptr = out_val;
+ }
+ }
+ }
+ }
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..11c5770f53
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu8_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ return roi_align<uint8_t, uint16_t>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..7f93cc87b3
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qs8_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
+{
+ return roi_align<int8_t, uint16_t>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h
new file mode 100644
index 0000000000..fdb3c0050d
--- /dev/null
+++ b/src/cpu/kernels/roialign/list.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H
+#define SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ROIALIGN_KERNEL(func_name) \
+ void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \
+ const Window &window, const ThreadInfo &info)
+DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign);
+DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign);
+DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign);
+DECLARE_ROIALIGN_KERNEL(neon_qs8_roialign);
+#undef DECLARE_ROIALIGN_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H */
diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp
new file mode 100644
index 0000000000..c8a7b7038e
--- /dev/null
+++ b/src/cpu/kernels/scale/neon/fp16.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/cpu/kernels/scale/neon/list.h"
+#include "support/Rounding.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+void fp16_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
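+    // Process 8 half-precision elements (one 128-bit NEON register) per vector iteration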
+ const int window_step_x = 8;
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
+}
+
+void fp16_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ // Compute the ratio between source height and destination height
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ Iterator out(dst, window);
+ const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const int in_dim_w = src->info()->dimension(1);
+ const int in_dim_h = src->info()->dimension(2);
+ const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
+
+ // Don't increment in Y and Z direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in);
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+        // std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t> always selects half
+        using ConstType = half;
+
+ const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+ const float16_t *in_ptr =
+ reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
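+                // Gather the 2x2 neighbourhood, substituting the constant border value for out-of-bounds taps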
+ const auto a00 =
+ (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+ const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+ auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
+ auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+ auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+ auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h1 * in_stride_wc);
+ const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h1 * in_stride_wc);
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+namespace cpu
+{
+#ifdef ENABLE_NCHW_KERNELS
+void fp16_bilinear_neon_scale_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ arm_compute::cpu::scale_bilinear_nchw<float16_t>(src, dst, dx, dy, offsets, border_mode, constant_border_value,
+ sampling_offset, align_corners, window);
+}
+
+void fp16_nearest_neon_scale_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ ARM_COMPUTE_UNUSED(border_mode);
+ arm_compute::cpu::scale_nearest_nchw<float16_t>(src, dst, dx, dy, offsets, constant_border_value, sampling_offset,
+ align_corners, window);
+}
+#endif // ENABLE_NCHW_KERNELS
+void fp16_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ if (policy == InterpolationPolicy::BILINEAR)
+ {
+ fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+}
+
+void fp16_common_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ arm_compute::cpu::common_neon_scale<float16_t>(src, dst, offsets, dx, dy, policy, border_mode,
+ constant_border_value, sampling_offset, align_corners, window);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp
new file mode 100644
index 0000000000..bbf92e0412
--- /dev/null
+++ b/src/cpu/kernels/scale/neon/integer.cpp
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+void u8_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+ const int window_step_x = 16;
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
+}
+
+void u8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ // Compute the ratio between source and destination dimensions
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ const int input_width = src->info()->dimension(1);
+ const int input_height = src->info()->dimension(2);
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+ Iterator out(dst, window);
+ const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const int in_stride_wc =
+ in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom);
+
+ // Don't increment in Y and Z direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in);
+
+ const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset);
+ const uint8_t *in_ptr =
+ reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height)
+ ? *in_ptr
+ : const_border_value;
+ const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<uint8_t *>(out.ptr()) =
+ static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
+
+ const int in_stride_x = src->info()->strides_in_bytes()[1];
+ const int in_stride_y = src->info()->strides_in_bytes()[2];
+ const int in_stride_b = src->info()->strides_in_bytes()[3];
+ const int out_stride_x = dst->info()->strides_in_bytes()[1];
+ const int out_stride_y = dst->info()->strides_in_bytes()[2];
+ const int out_stride_b = dst->info()->strides_in_bytes()[3];
+
+ const int out_dim_ch = dst->info()->dimension(0);
+ constexpr int step_cout = 16;
+
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
+
+ const int xo_start = window_execution[1].start();
+ const int xo_end = window_execution[1].end();
+ const int xo_step = window_execution[1].step();
+ const int yo_start = window_execution[2].start();
+ const int yo_end = window_execution[2].end();
+ const int yo_step = window_execution[2].step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
+
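+        // (xo + sampling_offset) * scale - sampling_offset == xo * scale + sampling_offset * (scale - 1),
+        // so the constant term of the source coordinate is hoisted out of the loops below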
+ const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
+ const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
+
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
+ {
+ const uint8_t *in_ptr = in.ptr() + bo * in_stride_b;
+ uint8_t *out_ptr = out.ptr() + bo * out_stride_b;
+
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = yo * scale_y + fp_coord_offset_y;
+ // Integer coordinate
+ const int yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const float a1 = (yi_f - static_cast<float>(yi));
+ const float b1 = (1.f - a1);
+
+ const int yi0 = utility::clamp<int>(yi, 0, input_height - 1);
+ const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1);
+
+ const uint8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y;
+ const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
+
+ uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = xo * scale_x + fp_coord_offset_x;
+ // Integer coordinate
+ const int xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const float a = (xi_f - static_cast<float>(xi));
+ const float b = (1.f - a);
+
+ const float s00_s = b * b1;
+ const float s01_s = a * b1;
+ const float s10_s = b * a1;
+ const float s11_s = a * a1;
+
+ const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
+
+ const int xi0 = utility::clamp<int>(xi, 0, input_width - 1);
+ const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1);
+
+ const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x;
+ const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x;
+
+ uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
+
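+                    // Vector path: widen 16 u8 channel values to four float32x4_t vectors, accumulate the
+                    // four bilinear taps with multiply-accumulate, then narrow back to u8 with saturation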
+ int cout = 0;
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
+ const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
+ const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(uint8_t));
+ const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(uint8_t));
+
+ const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
+ const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
+
+ const auto in00_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_low)));
+ const auto in00_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_low)));
+ const auto in00_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_high)));
+ const auto in00_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_high)));
+
+ const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
+ const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
+
+ const auto in01_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_low)));
+ const auto in01_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_low)));
+ const auto in01_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_high)));
+ const auto in01_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_high)));
+
+ const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
+ const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
+
+ const auto in10_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_low)));
+ const auto in10_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_low)));
+ const auto in10_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_high)));
+ const auto in10_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_high)));
+
+ const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
+ const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
+
+ const auto in11_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_low)));
+ const auto in11_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_low)));
+ const auto in11_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_high)));
+ const auto in11_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_high)));
+
+ auto out_0 = wrapper::vmul(in00_0, s00);
+ out_0 = wrapper::vmla(out_0, in01_0, s01);
+ out_0 = wrapper::vmla(out_0, in10_0, s10);
+ out_0 = wrapper::vmla(out_0, in11_0, s11);
+
+ auto out_1 = wrapper::vmul(in00_1, s00);
+ out_1 = wrapper::vmla(out_1, in01_1, s01);
+ out_1 = wrapper::vmla(out_1, in10_1, s10);
+ out_1 = wrapper::vmla(out_1, in11_1, s11);
+
+ auto out_2 = wrapper::vmul(in00_2, s00);
+ out_2 = wrapper::vmla(out_2, in01_2, s01);
+ out_2 = wrapper::vmla(out_2, in10_2, s10);
+ out_2 = wrapper::vmla(out_2, in11_2, s11);
+
+ auto out_3 = wrapper::vmul(in00_3, s00);
+ out_3 = wrapper::vmla(out_3, in01_3, s01);
+ out_3 = wrapper::vmla(out_3, in10_3, s10);
+ out_3 = wrapper::vmla(out_3, in11_3, s11);
+
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvta<uint32_t>(out_0);
+ const auto out_1_int = wrapper::vcvta<uint32_t>(out_1);
+ const auto out_2_int = wrapper::vcvta<uint32_t>(out_2);
+ const auto out_3_int = wrapper::vcvta<uint32_t>(out_3);
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvt<uint32_t>(out_0);
+ const auto out_1_int = wrapper::vcvt<uint32_t>(out_1);
+ const auto out_2_int = wrapper::vcvt<uint32_t>(out_2);
+ const auto out_3_int = wrapper::vcvt<uint32_t>(out_3);
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
+
+ wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
+ }
+
+ for (; cout < out_dim_ch; ++cout)
+ {
+ const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
+ const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
+ const uint8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(uint8_t));
+ const uint8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(uint8_t));
+
+ float out0 = in00 * s00_s;
+ out0 += in01 * s01_s;
+ out0 += in10 * s10_s;
+ out0 += in11 * s11_s;
+
+ // Rounding modes of vector and scalar loops should match
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = static_cast<uint8_t>(std::round(out0));
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = static_cast<uint8_t>(out0);
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+void s8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value);
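+    // Only BorderMode::REPLICATE is implemented for this signed 8-bit bilinear path, so the
+    // precomputed offsets/dx/dy tables and the constant border value are unused here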
+ if (border_mode == BorderMode::REPLICATE)
+ {
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
+
+ // Compute the ratio between source and destination dimensions
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ const int in_stride_x = src->info()->strides_in_bytes()[1];
+ const int in_stride_y = src->info()->strides_in_bytes()[2];
+ const int in_stride_b = src->info()->strides_in_bytes()[3];
+ const int out_stride_x = dst->info()->strides_in_bytes()[1];
+ const int out_stride_y = dst->info()->strides_in_bytes()[2];
+ const int out_stride_b = dst->info()->strides_in_bytes()[3];
+ const int input_width = src->info()->dimension(1);
+ const int input_height = src->info()->dimension(2);
+ const int out_dim_ch = dst->info()->dimension(0);
+ constexpr int step_cout = 16;
+
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
+
+ const int xo_start = window_execution[1].start();
+ const int xo_end = window_execution[1].end();
+ const int xo_step = window_execution[1].step();
+ const int yo_start = window_execution[2].start();
+ const int yo_end = window_execution[2].end();
+ const int yo_step = window_execution[2].step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
+
+ const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
+ const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
+
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
+ {
+ const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
+ int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
+
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = yo * scale_y + fp_coord_offset_y;
+ // Integer coordinate
+ const int yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const float a1 = (yi_f - static_cast<float>(yi));
+ const float b1 = (1.f - a1);
+
+ const int yi0 = utility::clamp<int>(yi, 0, input_height - 1);
+ const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1);
+
+ const int8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y;
+ const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
+
+ int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = xo * scale_x + fp_coord_offset_x;
+ // Integer coordinate
+ const int xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const float a = (xi_f - static_cast<float>(xi));
+ const float b = (1.f - a);
+
+ const float s00_s = b * b1;
+ const float s01_s = a * b1;
+ const float s10_s = b * a1;
+ const float s11_s = a * a1;
+
+ const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
+
+ const int xi0 = utility::clamp<int>(xi, 0, input_width - 1);
+ const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1);
+
+ const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x;
+ const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x;
+
+ int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
+
+ int cout = 0;
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
+ const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
+ const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(int8_t));
+ const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(int8_t));
+
+ const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
+ const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
+
+ const auto in00_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_low)));
+ const auto in00_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_low)));
+ const auto in00_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_high)));
+ const auto in00_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_high)));
+
+ const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
+ const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
+
+ const auto in01_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_low)));
+ const auto in01_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_low)));
+ const auto in01_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_high)));
+ const auto in01_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_high)));
+
+ const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
+ const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
+
+ const auto in10_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_low)));
+ const auto in10_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_low)));
+ const auto in10_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_high)));
+ const auto in10_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_high)));
+
+ const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
+ const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
+
+ const auto in11_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_low)));
+ const auto in11_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_low)));
+ const auto in11_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_high)));
+ const auto in11_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_high)));
+
+ auto out_0 = wrapper::vmul(in00_0, s00);
+ out_0 = wrapper::vmla(out_0, in01_0, s01);
+ out_0 = wrapper::vmla(out_0, in10_0, s10);
+ out_0 = wrapper::vmla(out_0, in11_0, s11);
+
+ auto out_1 = wrapper::vmul(in00_1, s00);
+ out_1 = wrapper::vmla(out_1, in01_1, s01);
+ out_1 = wrapper::vmla(out_1, in10_1, s10);
+ out_1 = wrapper::vmla(out_1, in11_1, s11);
+
+ auto out_2 = wrapper::vmul(in00_2, s00);
+ out_2 = wrapper::vmla(out_2, in01_2, s01);
+ out_2 = wrapper::vmla(out_2, in10_2, s10);
+ out_2 = wrapper::vmla(out_2, in11_2, s11);
+
+ auto out_3 = wrapper::vmul(in00_3, s00);
+ out_3 = wrapper::vmla(out_3, in01_3, s01);
+ out_3 = wrapper::vmla(out_3, in10_3, s10);
+ out_3 = wrapper::vmla(out_3, in11_3, s11);
+
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvta<int32_t>(out_0);
+ const auto out_1_int = wrapper::vcvta<int32_t>(out_1);
+ const auto out_2_int = wrapper::vcvta<int32_t>(out_2);
+ const auto out_3_int = wrapper::vcvta<int32_t>(out_3);
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvt<int32_t>(out_0);
+ const auto out_1_int = wrapper::vcvt<int32_t>(out_1);
+ const auto out_2_int = wrapper::vcvt<int32_t>(out_2);
+ const auto out_3_int = wrapper::vcvt<int32_t>(out_3);
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
+
+ wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
+ }
+
+ for (; cout < out_dim_ch; ++cout)
+ {
+ const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
+ const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
+ const int8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(int8_t));
+ const int8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(int8_t));
+
+ float out0 = in00 * s00_s;
+ out0 += in01 * s01_s;
+ out0 += in10 * s10_s;
+ out0 += in11 * s11_s;
+
+ // Rounding modes of vector and scalar loops should match
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(int8_t)) = static_cast<int8_t>(std::round(out0));
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(int8_t)) = static_cast<int8_t>(out0);
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+void s16_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+ const int window_step_x = 8;
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
+}
+
+void s16_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ // Compute the ratio between source height and destination height
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ Iterator out(dst, window);
+ const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const int in_dim_w = src->info()->dimension(1);
+ const int in_dim_h = src->info()->dimension(2);
+ const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
+
+ // Don't increment in Y and Z direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in);
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+ const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+ const int16_t *in_ptr =
+ reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 =
+ (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+ const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<int16_t *>(out.ptr()) =
+ static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+ const auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
+ const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+ const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+ const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+ const auto a00 =
+ *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+ const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h1 * in_stride_wc);
+ const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h1 * in_stride_wc);
+
+ *reinterpret_cast<int16_t *>(out.ptr()) =
+ static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+namespace cpu
+{
+void s8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ if (policy == InterpolationPolicy::BILINEAR)
+ {
+ s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+void u8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ if (policy == InterpolationPolicy::BILINEAR)
+ {
+ u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+}
+
+void s16_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ if (policy == InterpolationPolicy::BILINEAR)
+ {
+ s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h
new file mode 100644
index 0000000000..153dc67c3d
--- /dev/null
+++ b/src/cpu/kernels/scale/neon/list.h
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_SCALE_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+ InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \
+ float sampling_offset, bool align_corners, const Window &window)
+
+DECLARE_SCALE_KERNEL(s16_neon_scale);
+DECLARE_SCALE_KERNEL(u8_neon_scale);
+DECLARE_SCALE_KERNEL(s8_neon_scale);
+DECLARE_SCALE_KERNEL(qasymm8_neon_scale);
+DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale);
+DECLARE_SCALE_KERNEL(fp16_common_neon_scale);
+DECLARE_SCALE_KERNEL(fp16_bilinear_neon_scale_nchw);
+DECLARE_SCALE_KERNEL(fp16_nearest_neon_scale_nchw);
+
+#undef DECLARE_SCALE_KERNEL
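+// For reference, DECLARE_SCALE_KERNEL(u8_neon_scale) expands to a plain function
+// declaration with the common scale-kernel signature:
+//
+//   void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets,
+//                      const ITensor *dx, const ITensor *dy, InterpolationPolicy policy,
+//                      BorderMode border_mode, PixelValue constant_border_value,
+//                      float sampling_offset, bool align_corners, const Window &window);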
+
+#ifdef ENABLE_NCHW_KERNELS
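+// Nearest-neighbour scaling for NCHW tensors. The x coordinate of the source
+// pixel comes from the precomputed offsets tensor, while the y coordinate is
+// derived per row as
+//   in_yi = round_half_away_from_zero((y_out + sampling_offset) * hr)  when align_corners is set
+//   in_yi = floor((y_out + sampling_offset) * hr)                      otherwise
+// where hr is the source/destination height ratio.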
+template <typename T>
+void scale_nearest_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy);
+ ARM_COMPUTE_UNUSED(constant_border_value);
+ const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+
+ // Compute the ratio between source height and destination height
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ // Set offsets window
+ Window win_off;
+ win_off.set(Window::DimX, window[Window::DimX]);
+ win_off.set(Window::DimY, window[Window::DimY]);
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ // Create iterators
+ Iterator src_i(src, win_in);
+ Iterator dst_i(dst, window);
+ Iterator offsets_i(offsets, win_off);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
+ const auto in_yi = static_cast<int32_t>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.y() + sampling_offset) * hr)
+ : std::floor((id.y() + sampling_offset) * hr));
+ const int32_t offset_row = in_yi * in_stride_x;
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
+ },
+ src_i, offsets_i, dst_i);
+}
+
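+// Bilinear scaling for NCHW tensors. For each output pixel the four
+// neighbouring input samples a00, a01, a10 and a11 are gathered (with border
+// handling according to BorderMode) and blended using the fractional offsets
+// dx and dy; delta_bilinear() is expected to compute the usual weighting
+//   a00*(1-dx)*(1-dy) + a01*dx*(1-dy) + a10*(1-dx)*dy + a11*dx*dy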
+template <typename T>
+void scale_bilinear_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ // Compute the ratio between source height and destination height
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ Window win_off;
+ win_off.set(Window::DimX, window.x());
+ win_off.set(Window::DimY, window.y());
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator src_i(src, win_in);
+ Iterator dst_i(dst, window);
+ Iterator offsets_i(offsets, win_off);
+ Iterator dx_i(dx, win_off);
+ Iterator dy_i(dy, win_off);
+
+ const int32_t in_dim_w = src->info()->dimension(0);
+ const int32_t in_dim_h = src->info()->dimension(1);
+ const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right;
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ using ConstType = T;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id.y() + sampling_offset) * hr - sampling_offset);
+ const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w + index_h * in_stride_w))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w))
+ : const_border_value;
+ const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w))
+ : const_border_value;
+
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ src_i, offsets_i, dx_i, dy_i, dst_i);
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int index_h = std::floor((id.y() + sampling_offset) * hr - sampling_offset);
+ const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+ auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+ auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+ auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
+ const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
+ const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
+ const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
+
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ src_i, offsets_i, dx_i, dy_i, dst_i);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+#endif // ENABLE_NCHW_KERNELS
+
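+// Nearest-neighbour scaling for NHWC tensors. Channels are innermost, so each
+// output pixel is a contiguous copy of one input pixel: the inner loop copies
+// 16 / sizeof(T) channels at a time with vloadq/vstore and handles the
+// remaining channels in a scalar tail loop.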
+template <typename T>
+void nearest_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(offsets);
+
+ // Compute the ratio between source and destination dimensions
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ const int in_stride_y = src->info()->strides_in_bytes()[1];
+ const int in_stride_z = src->info()->strides_in_bytes()[2];
+ const int in_stride_w = src->info()->strides_in_bytes()[3];
+ const int out_stride_y = dst->info()->strides_in_bytes()[1];
+ const int out_stride_z = dst->info()->strides_in_bytes()[2];
+ const int out_stride_w = dst->info()->strides_in_bytes()[3];
+ const int out_dim_ch = dst->info()->dimension(0);
+ const int step_cout = 16 / sizeof(T);
+
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
+
+ const int xo_start = window_execution.y().start();
+ const int xo_end = window_execution.y().end();
+ const int xo_step = window_execution.y().step();
+ const int yo_start = window_execution.z().start();
+ const int yo_end = window_execution.z().end();
+ const int yo_step = window_execution.z().step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
+
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
+ {
+ const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
+ uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
+
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ float yi_f = ((yo + sampling_offset) * scale_y);
+ int yi = 0;
+ if (align_corners)
+ {
+ yi = utils::rounding::round_half_away_from_zero(yi_f);
+ }
+ else
+ {
+ yi = static_cast<int>(std::floor(yi_f));
+ }
+
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ float xi_f = ((xo + sampling_offset) * scale_x);
+ int xi = 0;
+ if (align_corners)
+ {
+ xi = utils::rounding::round_half_away_from_zero(xi_f);
+ }
+ else
+ {
+ xi = static_cast<int>(std::floor(xi_f));
+ }
+
+ const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
+ uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
+
+ int cout = 0;
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
+ }
+
+ for (; cout < out_dim_ch; ++cout)
+ {
+ auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
+ }
+ }
+ }
+ }
+}
+
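+// Bilinear scaling for NHWC tensors. The x/y fractions a and a1 give the
+// per-pixel weights
+//   s00 = (1-a)*(1-a1), s01 = a*(1-a1), s10 = (1-a)*a1, s11 = a*a1
+// which are broadcast across the channel vector and accumulated with
+// vmul/vmla over the four neighbouring pixels. CONSTANT borders substitute
+// const_border_value for out-of-range neighbours; REPLICATE clamps the
+// coordinates to the valid range instead.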
+template <typename T>
+void bilinear_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(offsets);
+ ARM_COMPUTE_UNUSED(dx);
+ ARM_COMPUTE_UNUSED(dy);
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Compute the ratio between source and destination dimensions
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ const int in_stride_y = src->info()->strides_in_bytes()[1];
+ const int in_stride_z = src->info()->strides_in_bytes()[2];
+ const int in_stride_w = src->info()->strides_in_bytes()[3];
+ const int out_stride_y = dst->info()->strides_in_bytes()[1];
+ const int out_stride_z = dst->info()->strides_in_bytes()[2];
+ const int out_stride_w = dst->info()->strides_in_bytes()[3];
+ const int in_dim_w = src->info()->dimension(1);
+ const int in_dim_h = src->info()->dimension(2);
+ const int out_dim_ch = dst->info()->dimension(0);
+ const int step_cout = 16 / sizeof(T);
+
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
+
+ const int xo_start = window_execution.y().start();
+ const int xo_end = window_execution.y().end();
+ const int xo_step = window_execution.y().step();
+ const int yo_start = window_execution.z().start();
+ const int yo_end = window_execution.z().end();
+ const int yo_step = window_execution.z().step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ using ConstType = T;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
+
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
+ {
+ const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
+ uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
+
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
+ // Integer coordinate
+ const auto yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const auto a1 = (yi_f - static_cast<float>(yi));
+ const auto b1 = (1.f - a1);
+
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
+ // Integer coordinate
+ const auto xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const auto a = (xi_f - static_cast<float>(xi));
+ const auto b = (1.f - a);
+
+ const auto s00_s = static_cast<T>(b * b1);
+ const auto s01_s = static_cast<T>(a * b1);
+ const auto s10_s = static_cast<T>(b * a1);
+ const auto s11_s = static_cast<T>(a * a1);
+
+ const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
+ uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
+
+ int cout = 0;
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ if ((yi >= 0) && (yi < in_dim_h))
+ {
+ if ((xi >= 0) && (xi < in_dim_w))
+ {
+ in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ }
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in01 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
+ }
+ }
+ if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ {
+ if ((xi >= 0) && (xi < in_dim_w))
+ {
+ in10 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
+ }
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in11 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ }
+ }
+
+ const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
+ auto out0 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ out0 = wrapper::vmla(out0, in00, s00);
+ out0 = wrapper::vmla(out0, in01, s01);
+ out0 = wrapper::vmla(out0, in10, s10);
+ out0 = wrapper::vmla(out0, in11, s11);
+ wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
+ }
+
+ for (; cout < out_dim_ch; ++cout)
+ {
+ auto in00 = static_cast<T>(const_border_value);
+ auto in01 = static_cast<T>(const_border_value);
+ auto in10 = static_cast<T>(const_border_value);
+ auto in11 = static_cast<T>(const_border_value);
+ if ((yi >= 0) && (yi < in_dim_h))
+ {
+ if ((xi >= 0) && (xi < in_dim_w))
+ {
+ in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ }
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
+ }
+ }
+ if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ {
+ if ((xi >= 0) && (xi < in_dim_w))
+ {
+ in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
+ }
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in11 = *(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ }
+ }
+ auto out0 = static_cast<T>(0);
+ out0 += in00 * s00_s;
+ out0 += in01 * s01_s;
+ out0 += in10 * s10_s;
+ out0 += in11 * s11_s;
+ *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
+ }
+ }
+ }
+ }
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
+ {
+ const uint8_t *in_ptr = in.ptr() + bo * in_stride_w;
+ uint8_t *out_ptr = out.ptr() + bo * out_stride_w;
+
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
+ // Integer coordinate
+ const auto yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const auto a1 = (yi_f - static_cast<float>(yi));
+ const auto b1 = (1.f - a1);
+
+ const int yi0 = utility::clamp<int>(yi, 0, in_dim_h - 1);
+ const int yi1 = utility::clamp<int>(yi + 1, 0, in_dim_h - 1);
+
+ const int yi0_offset = yi0 * in_stride_z;
+ const int yi1_offset = yi1 * in_stride_z;
+
+ const int y_offset = yo * out_stride_z;
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
+ // Integer coordinate
+ const auto xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const auto a = (xi_f - static_cast<float>(xi));
+ const auto b = (1.f - a);
+
+ const auto s00_s = static_cast<T>(b * b1);
+ const auto s01_s = static_cast<T>(a * b1);
+ const auto s10_s = static_cast<T>(b * a1);
+ const auto s11_s = static_cast<T>(a * a1);
+
+ const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
+
+ const int xi0 = utility::clamp<int>(xi, 0, in_dim_w - 1);
+ const int xi1 = utility::clamp<int>(xi + 1, 0, in_dim_w - 1);
+
+ const int xi0_offset = xi0 * in_stride_y;
+ const int xi1_offset = xi1 * in_stride_y;
+
+ const int offset = xo * out_stride_y + y_offset;
+
+ int cout = 0;
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ const auto in00 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+ const auto in01 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+ const auto in10 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+ const auto in11 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+
+ auto out0 = wrapper::vmul(in00, s00);
+ out0 = wrapper::vmla(out0, in01, s01);
+ out0 = wrapper::vmla(out0, in10, s10);
+ out0 = wrapper::vmla(out0, in11, s11);
+ wrapper::vstore(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T)), out0);
+ }
+
+ for (; cout < out_dim_ch; ++cout)
+ {
+ const T in00 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+ const T in01 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+ const T in10 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+ const T in11 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+
+ T out0 = in00 * s00_s;
+ out0 += in01 * s01_s;
+ out0 += in10 * s10_s;
+ out0 += in11 * s11_s;
+ *(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T))) = out0;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
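+// Generic policy dispatcher over the NHWC kernels above. A type-specific entry
+// point would typically just forward to this template; for instance, a
+// hypothetical fp32 kernel could be written as
+//
+//   void fp32_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets,
+//                        const ITensor *dx, const ITensor *dy, InterpolationPolicy policy,
+//                        BorderMode border_mode, PixelValue constant_border_value,
+//                        float sampling_offset, bool align_corners, const Window &window)
+//   {
+//       common_neon_scale<float>(src, dst, offsets, dx, dy, policy, border_mode,
+//                                constant_border_value, sampling_offset, align_corners, window);
+//   }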
+template <typename T>
+void common_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ if (policy == InterpolationPolicy::BILINEAR)
+ {
+ bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H
diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp
new file mode 100644
index 0000000000..62a821daa5
--- /dev/null
+++ b/src/cpu/kernels/scale/neon/qasymm8.cpp
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/cpu/kernels/scale/neon/list.h"
+
+namespace arm_compute
+{
+namespace
+{
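+// QASYMM8 bilinear scaling with re-quantization. Samples are widened
+// u8 -> u16 -> u32 with vmovl, shifted by the input zero-point, converted to
+// float and scaled, interpolated in the float domain, and finally requantized
+// with the output scale/offset. On AArch64 the vector path converts back with
+// vcvta (round to nearest, ties away from zero); elsewhere it truncates and
+// the scalar tail uses RoundingPolicy::TO_ZERO so the two loops stay
+// consistent.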
+void qasymm8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ // Data layout is NHWC
+ const int32_t input_width = src->info()->dimension(1);
+ const int32_t input_height = src->info()->dimension(2);
+
+ const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ // Compute the ratio between source and destination dimensions
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+ const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
+ const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
+
+        // Set up the offsets window
+ Window win_off;
+ win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(1, Window::Dimension(0, 0, 0));
+ win_in.set(2, Window::Dimension(0, 0, 0));
+
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator in(src, win_in);
+ Iterator out(dst, window);
+
+ const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+ const int32_t index_w =
+ *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+ const auto a11 =
+ (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
+ *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ in, out);
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
+ using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
+
+ const int in_stride_x = src->info()->strides_in_bytes()[1];
+ const int in_stride_y = src->info()->strides_in_bytes()[2];
+ const int in_stride_b = src->info()->strides_in_bytes()[3];
+ const int out_stride_x = dst->info()->strides_in_bytes()[1];
+ const int out_stride_y = dst->info()->strides_in_bytes()[2];
+ const int out_stride_b = dst->info()->strides_in_bytes()[3];
+ const int out_dim_ch = dst->info()->dimension(0);
+ constexpr int step_cout = 16;
+
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
+
+ const int xo_start = window_execution[1].start();
+ const int xo_end = window_execution[1].end();
+ const int xo_step = window_execution[1].step();
+ const int yo_start = window_execution[2].start();
+ const int yo_end = window_execution[2].end();
+ const int yo_step = window_execution[2].step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
+
+ const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const float32x4_t vscale_in = wrapper::vdup_n(iq_info.scale, FloatTagType{});
+ const int32x4_t voffset_in = wrapper::vdup_n(iq_info.offset, Int32TagType{}); // Offsets will be Int32
+
+ const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{});
+ const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset);
+
+ const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
+ const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
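+        // Folding the sampling offset into a constant:
+        //   (o + sampling_offset) * scale - sampling_offset == o * scale + sampling_offset * (scale - 1)
+        // so the per-output coordinate below reduces to a single multiply-add.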
+
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
+ {
+ const uint8_t *in_ptr = in.ptr() + bo * in_stride_b;
+ uint8_t *out_ptr = out.ptr() + bo * out_stride_b;
+
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = yo * scale_y + fp_coord_offset_y;
+ // Integer coordinate
+ const int yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const float a1 = (yi_f - static_cast<float>(yi));
+ const float b1 = (1.f - a1);
+
+ const int yi0 = utility::clamp<int>(yi, 0, input_height - 1);
+ const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1);
+
+ const uint8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y;
+ const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
+
+ uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = xo * scale_x + fp_coord_offset_x;
+ // Integer coordinate
+ const int xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const float a = (xi_f - static_cast<float>(xi));
+ const float b = (1.f - a);
+
+ const float s00_s = b * b1;
+ const float s01_s = a * b1;
+ const float s10_s = b * a1;
+ const float s11_s = a * a1;
+
+ const auto s00 = wrapper::vdup_n(s00_s, FloatTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, FloatTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, FloatTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, FloatTagType{});
+
+ const int xi0 = utility::clamp<int>(xi, 0, input_width - 1);
+ const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1);
+
+ const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x;
+ const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x;
+
+ uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
+
+ int cout = 0;
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
+ const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
+ const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(uint8_t));
+ const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(uint8_t));
+
+ const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
+ const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
+
+ const auto in00_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)),
+ vscale_in);
+ const auto in00_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)),
+ vscale_in);
+ const auto in00_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)),
+ vscale_in);
+ const auto in00_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)),
+ vscale_in);
+
+ const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
+ const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
+
+ const auto in01_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)),
+ vscale_in);
+ const auto in01_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)),
+ vscale_in);
+ const auto in01_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)),
+ vscale_in);
+ const auto in01_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)),
+ vscale_in);
+
+ const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
+ const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
+
+ const auto in10_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)),
+ vscale_in);
+ const auto in10_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)),
+ vscale_in);
+ const auto in10_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)),
+ vscale_in);
+ const auto in10_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)),
+ vscale_in);
+
+ const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
+ const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
+
+ const auto in11_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)),
+ vscale_in);
+ const auto in11_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)),
+ vscale_in);
+ const auto in11_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)),
+ vscale_in);
+ const auto in11_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)),
+ vscale_in);
+
+ auto out_0 = wrapper::vmul(in00_0, s00);
+ out_0 = wrapper::vmla(out_0, in01_0, s01);
+ out_0 = wrapper::vmla(out_0, in10_0, s10);
+ out_0 = wrapper::vmla(out_0, in11_0, s11);
+
+ auto out_1 = wrapper::vmul(in00_1, s00);
+ out_1 = wrapper::vmla(out_1, in01_1, s01);
+ out_1 = wrapper::vmla(out_1, in10_1, s10);
+ out_1 = wrapper::vmla(out_1, in11_1, s11);
+
+ auto out_2 = wrapper::vmul(in00_2, s00);
+ out_2 = wrapper::vmla(out_2, in01_2, s01);
+ out_2 = wrapper::vmla(out_2, in10_2, s10);
+ out_2 = wrapper::vmla(out_2, in11_2, s11);
+
+ auto out_3 = wrapper::vmul(in00_3, s00);
+ out_3 = wrapper::vmla(out_3, in01_3, s01);
+ out_3 = wrapper::vmla(out_3, in10_3, s10);
+ out_3 = wrapper::vmla(out_3, in11_3, s11);
+
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o));
+ const auto out_1_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o));
+ const auto out_2_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
+ const auto out_3_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o));
+ const auto out_1_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o));
+ const auto out_2_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
+ const auto out_3_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
+
+ wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
+ }
+
+ for (; cout < out_dim_ch; ++cout)
+ {
+ const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
+ const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
+ const uint8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(uint8_t));
+ const uint8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(uint8_t));
+
+ const float in00_f = (static_cast<int32_t>(in00) - iq_info.offset) * iq_info.scale;
+ const float in01_f = (static_cast<int32_t>(in01) - iq_info.offset) * iq_info.scale;
+ const float in10_f = (static_cast<int32_t>(in10) - iq_info.offset) * iq_info.scale;
+ const float in11_f = (static_cast<int32_t>(in11) - iq_info.offset) * iq_info.scale;
+
+ float out = in00_f * s00_s;
+ out += in01_f * s01_s;
+ out += in10_f * s10_s;
+ out += in11_f * s11_s;
+
+ // Rounding modes of vector and scalar loops should match
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info);
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(uint8_t)) =
+ quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO);
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+namespace cpu
+{
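+// When the input and output quantization info match, interpolating the raw u8
+// values is equivalent (up to rounding) to dequantizing, interpolating and
+// requantizing, so the cheaper u8_neon_scale path is taken. Nearest-neighbour
+// never mixes values, so it always operates on the raw u8 data.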
+void qasymm8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ if (policy == InterpolationPolicy::BILINEAR)
+ {
+ if (src->info()->quantization_info() == dst->info()->quantization_info())
+ {
+ u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ else
+ {
+ qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ }
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..5a885178a7
--- /dev/null
+++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/cpu/kernels/scale/neon/list.h"
+
+namespace arm_compute
+{
+namespace
+{
+void qasymm8_signed_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ // Data layout is NHWC
+ const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const int32_t input_width = src->info()->dimension(1);
+ const int32_t input_height = src->info()->dimension(2);
+
+ // Compute the ratio between source and destination dimensions
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+ if (border_mode == BorderMode::CONSTANT)
+ {
+ const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
+ const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
+
+ Window win_off;
+ win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(1, Window::Dimension(0, 0, 0));
+ win_in.set(2, Window::Dimension(0, 0, 0));
+
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator in(src, win_in);
+ Iterator out(dst, window);
+
+ const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+ const int32_t index_w =
+ *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+ const auto a11 =
+ (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
+ *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ in, out);
+ }
+ else if (border_mode == BorderMode::REPLICATE)
+ {
+ using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
+ using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
+
+ const int in_stride_x = src->info()->strides_in_bytes()[1];
+ const int in_stride_y = src->info()->strides_in_bytes()[2];
+ const int in_stride_b = src->info()->strides_in_bytes()[3];
+ const int out_stride_x = dst->info()->strides_in_bytes()[1];
+ const int out_stride_y = dst->info()->strides_in_bytes()[2];
+ const int out_stride_b = dst->info()->strides_in_bytes()[3];
+ const int out_dim_ch = dst->info()->dimension(0);
+ constexpr int step_cout = 16;
+
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
+
+ const int xo_start = window_execution[1].start();
+ const int xo_end = window_execution[1].end();
+ const int xo_step = window_execution[1].step();
+ const int yo_start = window_execution[2].start();
+ const int yo_end = window_execution[2].end();
+ const int yo_step = window_execution[2].step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
+
+ const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
+ const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
+
+ const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const float32x4_t vscale_in = wrapper::vdup_n(iq_info.scale, FloatTagType{});
+ const int32x4_t voffset_in = wrapper::vdup_n(iq_info.offset, Int32TagType{}); // Offsets will be Int32
+
+ const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{});
+ const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset);
+
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
+ {
+ const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
+ int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
+
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = yo * scale_y + fp_coord_offset_y;
+ // Integer coordinate
+ const int yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const float a1 = (yi_f - static_cast<float>(yi));
+ const float b1 = (1.f - a1);
+
+ const int yi0 = utility::clamp<int>(yi, 0, input_height - 1);
+ const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1);
+
+ const int8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y;
+ const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
+
+ int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = xo * scale_x + fp_coord_offset_x;
+ // Integer coordinate
+ const int xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const float a = (xi_f - static_cast<float>(xi));
+ const float b = (1.f - a);
+
+ const float s00_s = b * b1;
+ const float s01_s = a * b1;
+ const float s10_s = b * a1;
+ const float s11_s = a * a1;
+
+ const auto s00 = wrapper::vdup_n(s00_s, FloatTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, FloatTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, FloatTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, FloatTagType{});
+
+ const int xi0 = utility::clamp<int>(xi, 0, input_width - 1);
+ const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1);
+
+ const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x;
+ const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x;
+ const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x;
+
+ int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
+
+ int cout = 0;
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
+ const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
+ const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(int8_t));
+ const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(int8_t));
+
+ const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
+ const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
+
+ const auto in00_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)),
+ vscale_in);
+ const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)),
+ vscale_in);
+ const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)),
+ vscale_in);
+ const auto in00_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)),
+ vscale_in);
+
+ const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
+ const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
+
+ const auto in01_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)),
+ vscale_in);
+ const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)),
+ vscale_in);
+ const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)),
+ vscale_in);
+ const auto in01_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)),
+ vscale_in);
+
+ const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
+ const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
+
+ const auto in10_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)),
+ vscale_in);
+ const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)),
+ vscale_in);
+ const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)),
+ vscale_in);
+ const auto in10_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)),
+ vscale_in);
+
+ const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
+ const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
+
+ const auto in11_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)),
+ vscale_in);
+ const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)),
+ vscale_in);
+ const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)),
+ vscale_in);
+ const auto in11_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)),
+ vscale_in);
+
+ auto out_0 = wrapper::vmul(in00_0, s00);
+ out_0 = wrapper::vmla(out_0, in01_0, s01);
+ out_0 = wrapper::vmla(out_0, in10_0, s10);
+ out_0 = wrapper::vmla(out_0, in11_0, s11);
+
+ auto out_1 = wrapper::vmul(in00_1, s00);
+ out_1 = wrapper::vmla(out_1, in01_1, s01);
+ out_1 = wrapper::vmla(out_1, in10_1, s10);
+ out_1 = wrapper::vmla(out_1, in11_1, s11);
+
+ auto out_2 = wrapper::vmul(in00_2, s00);
+ out_2 = wrapper::vmla(out_2, in01_2, s01);
+ out_2 = wrapper::vmla(out_2, in10_2, s10);
+ out_2 = wrapper::vmla(out_2, in11_2, s11);
+
+ auto out_3 = wrapper::vmul(in00_3, s00);
+ out_3 = wrapper::vmla(out_3, in01_3, s01);
+ out_3 = wrapper::vmla(out_3, in10_3, s10);
+ out_3 = wrapper::vmla(out_3, in11_3, s11);
+
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o));
+ const auto out_1_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o));
+ const auto out_2_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
+ const auto out_3_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto out_0_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o));
+ const auto out_1_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o));
+ const auto out_2_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
+ const auto out_3_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
+
+ wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
+ }
+
+ for (; cout < out_dim_ch; ++cout)
+ {
+ const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
+ const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
+ const int8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(int8_t));
+ const int8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(int8_t));
+
+ const float in00_f = (static_cast<int32_t>(in00) - iq_info.offset) * iq_info.scale;
+ const float in01_f = (static_cast<int32_t>(in01) - iq_info.offset) * iq_info.scale;
+ const float in10_f = (static_cast<int32_t>(in10) - iq_info.offset) * iq_info.scale;
+ const float in11_f = (static_cast<int32_t>(in11) - iq_info.offset) * iq_info.scale;
+
+ float out = in00_f * s00_s;
+ out += in01_f * s01_s;
+ out += in10_f * s10_s;
+ out += in11_f * s11_s;
+
+ // Rounding modes of vector and scalar loops should match
+#if defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info);
+#else // defined(__aarch64__) && !defined(BARE_METAL)
+ *(out_ptr_xo_yo + cout * sizeof(int8_t)) =
+ quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO);
+#endif // defined(__aarch64__) && !defined(BARE_METAL)
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+namespace cpu
+{
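+// Signed counterpart of the QASYMM8 dispatcher. The raw s8 fast path is only
+// taken for matching quantization info with REPLICATE borders; otherwise the
+// fully quantized bilinear kernel above is used.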
+void qasymm8_signed_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ if (policy == InterpolationPolicy::BILINEAR)
+ {
+ if (src->info()->quantization_info() == dst->info()->quantization_info() &&
+ border_mode == BorderMode::REPLICATE)
+ {
+ s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
+ }
+ else
+ {
+ qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value,
+ sampling_offset, align_corners, window);
+ }
+ }
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
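For reference, the per-channel arithmetic of the bilinear path above (both the NEON body and its scalar tail) is: dequantize the four neighbouring QASYMM8_SIGNED samples with the input quantization info, blend them with the dx/dy weights, and requantize with the output quantization info. The following is a minimal standalone sketch of that per-pixel computation; the function name, the explicit std::lround rounding and the clamp are illustrative only and not part of the patch.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch: scalar QASYMM8_SIGNED bilinear blend of four neighbours.
// (iq_scale, iq_offset) / (oq_scale, oq_offset) are the input / output uniform
// quantization parameters; s00..s11 are the bilinear weights, e.g.
// s00 = (1 - dx) * (1 - dy), s01 = dx * (1 - dy), and so on.
inline int8_t bilinear_qasymm8_signed_ref(int8_t in00, int8_t in01, int8_t in10, int8_t in11,
                                          float s00, float s01, float s10, float s11,
                                          float iq_scale, int32_t iq_offset,
                                          float oq_scale, int32_t oq_offset)
{
    // Dequantize the four neighbours
    const float f00 = (static_cast<int32_t>(in00) - iq_offset) * iq_scale;
    const float f01 = (static_cast<int32_t>(in01) - iq_offset) * iq_scale;
    const float f10 = (static_cast<int32_t>(in10) - iq_offset) * iq_scale;
    const float f11 = (static_cast<int32_t>(in11) - iq_offset) * iq_scale;

    // Bilinear blend
    const float acc = f00 * s00 + f01 * s01 + f10 * s10 + f11 * s11;

    // Requantize: round to nearest (matching the aarch64 path above) and saturate to int8
    const int32_t q = static_cast<int32_t>(std::lround(acc / oq_scale)) + oq_offset;
    return static_cast<int8_t>(std::min(127, std::max(-128, q)));
}

This round trip is exactly what qasymm8_signed_neon_scale above skips when the input and output quantization infos match, by falling back to the plain int8 bilinear path instead.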
diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp
new file mode 100644
index 0000000000..cb28f4cb1c
--- /dev/null
+++ b/src/cpu/kernels/scale/sve/fp16.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+void fp16_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
+
+            // Process one predicated SVE vector of elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ out);
+}
+} // namespace
+namespace cpu
+{
+void fp16_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
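All of the SVE nearest-neighbour kernels in this patch (fp16 above, and the fp32, integer and quantized variants that follow) share one predicated copy loop: an svwhilelt predicate covers the lanes still in range, a predicated load/store moves one vector of row data, and the loop ends once no lane remains active. The sketch below shows the bare pattern for 16-bit data in isolation; the function name is illustrative and, unlike the kernels in this patch, it steps by svcnth(), the number of 16-bit lanes per vector.

#include <arm_sve.h> // SVE ACLE intrinsics; build with an SVE-enabled toolchain

// Sketch: copy `len` float16 elements with a predicated SVE loop.
void sve_copy_f16_ref(const float16_t *src, float16_t *dst, int32_t len)
{
    int32_t  x  = 0;
    svbool_t pg = svwhilelt_b16(x, len); // active for lanes x .. len-1, capped at the vector length
    do
    {
        // Predicated load from the source row, predicated store to the destination row
        svst1_f16(pg, dst + x, svld1_f16(pg, src + x));

        x += svcnth();                   // advance by one vector of 16-bit lanes
        pg = svwhilelt_b16(x, len);
    } while (svptest_any(svptrue_b16(), pg)); // stop once the predicate is all-false
}

Because the predicate is recomputed from the remaining length on every iteration, any positive step no larger than the vector length, including the svcntw() step used by the kernels here, produces the same result; a smaller step simply revisits some lanes.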
diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp
new file mode 100644
index 0000000000..cbb345edbb
--- /dev/null
+++ b/src/cpu/kernels/scale/sve/fp32.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+void fp32_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
+
+            // Process one predicated SVE vector of elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ out);
+}
+} // namespace
+namespace cpu
+{
+void fp32_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp
new file mode 100644
index 0000000000..df950b1789
--- /dev/null
+++ b/src/cpu/kernels/scale/sve/integer.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+void u8_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+
+            // Process one predicated SVE vector of elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
+}
+
+void s16_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr());
+
+            // Process one predicated SVE vector of elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ out);
+}
+} // namespace
+namespace cpu
+{
+void u8_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not Implemented");
+ }
+}
+
+void s16_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not Implemented");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h
new file mode 100644
index 0000000000..aff741a4a7
--- /dev/null
+++ b/src/cpu/kernels/scale/sve/list.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_SCALE_LIST_H
+#define SRC_CORE_SVE_KERNELS_SCALE_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_SCALE_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+ InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \
+ float sampling_offset, bool align_corners, const Window &window)
+
+DECLARE_SCALE_KERNEL(fp16_sve_scale);
+DECLARE_SCALE_KERNEL(fp32_sve_scale);
+DECLARE_SCALE_KERNEL(s16_sve_scale);
+DECLARE_SCALE_KERNEL(u8_sve_scale);
+DECLARE_SCALE_KERNEL(qasymm8_sve_scale);
+DECLARE_SCALE_KERNEL(qasymm8_signed_sve_scale);
+
+#undef DECLARE_SCALE_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* SRC_CORE_SVE_KERNELS_SCALE_LIST_H */
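Each DECLARE_SCALE_KERNEL(...) line above is just a forward declaration with the shared scale-kernel signature; for instance, DECLARE_SCALE_KERNEL(fp32_sve_scale) expands to:

void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
                    InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,
                    float sampling_offset, bool align_corners, const Window &window);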
diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp
new file mode 100644
index 0000000000..0fc794c6c2
--- /dev/null
+++ b/src/cpu/kernels/scale/sve/qasymm8.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+void qasymm8_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+
+            // Process one predicated SVE vector of elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
+}
+} // namespace
+namespace cpu
+{
+void qasymm8_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not Implemented");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
new file mode 100644
index 0000000000..68ea01e29e
--- /dev/null
+++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+void qasymm8_signed_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+ const size_t in_stride_wc = in_stride_w * in_stride_c;
+ const size_t in_dim_h = src->info()->dimension(2);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator out(dst, win);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+ const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr());
+
+            // Process one predicated SVE vector of elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
+}
+} // namespace
+namespace cpu
+{
+void qasymm8_signed_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not Implemented");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..38a58099bd
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/fp16.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_f16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_16<float16_t, uint16x8_t>(c, x, y, output, window);
+}
+void neon_f16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<float16_t>(c, x, y, output, window);
+}
+
+} // namespace cpu
+
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..50a80cb338
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/fp32.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_f32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_32<float, uint32x4_t>(c, x, y, output, window);
+}
+void neon_f32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<float>(c, x, y, output, window);
+}
+
+} // namespace cpu
+
+} // namespace arm_compute
diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h
new file mode 100644
index 0000000000..7ce640b6ff
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/impl.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+#include <arm_neon.h>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType, typename VectorType>
+void select_op(const ITensor *cond,
+ const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ const int window_step_x,
+ const int window_start_x,
+ const int window_end_x,
+ const int limit,
+ VectorType (*condition_conversion)(const uint8_t *))
+{
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator condition(cond, win);
+ Iterator input1(in1, win);
+ Iterator input2(in2, win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+ for (; x <= limit; x += window_step_x)
+ {
+ const auto c = (*condition_conversion)(condition_ptr + x);
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ const auto c = *(condition_ptr + x);
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+ }
+ },
+ condition, input1, input2, output);
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+ });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+ });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+ });
+}
+
+template <typename ScalarType>
+void select_op_not_same_rank(
+ const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer());
+
+ const int outer_size = cond->info()->total_size() / cond->info()->element_size();
+ const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
+ int offset = 0;
+ const int step = 16 / in1->info()->element_size();
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ int x = offset;
+ const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
+ for (; x <= offset + inner_size - step; x += step)
+ {
+ wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
+ }
+ if (x <= offset + inner_size - (step / 2))
+ {
+ wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
+ x += step / 2;
+ }
+ for (; x < offset + inner_size; ++x)
+ {
+ *(output_ptr + x) = *(input_ptr + x);
+ }
+ offset += inner_size;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
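As a plain scalar reference for the two code paths in this header: select_op_8/16/32 pick between x and y element-wise using a byte condition tensor of the same shape, while select_op_not_same_rank treats the condition as one flag per outer slice and copies whole inner blocks. A minimal sketch of that behaviour on raw buffers follows; the function names and the std::copy inner loop are illustrative only.

#include <algorithm>
#include <cstdint>

// Sketch: element-wise select (same rank): out[i] = c[i] ? x[i] : y[i].
template <typename T>
void select_same_rank_ref(const uint8_t *c, const T *x, const T *y, T *out, int n)
{
    for (int i = 0; i < n; ++i)
    {
        out[i] = c[i] ? x[i] : y[i];
    }
}

// Sketch: broadcast select (condition rank < input rank): one condition flag per
// outer slice chooses which input provides the whole inner block of `inner` elements.
template <typename T>
void select_not_same_rank_ref(const uint8_t *c, const T *x, const T *y, T *out, int outer, int inner)
{
    for (int i = 0; i < outer; ++i)
    {
        const T *src = c[i] ? x : y;
        std::copy(src + i * inner, src + (i + 1) * inner, out + i * inner);
    }
}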
diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp
new file mode 100644
index 0000000000..135087c261
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/integer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_s8_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window);
+}
+void neon_s16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window);
+}
+void neon_s32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window);
+}
+void neon_s8_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<int8_t>(c, x, y, output, window);
+}
+void neon_s16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<int16_t>(c, x, y, output, window);
+}
+void neon_s32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<int32_t>(c, x, y, output, window);
+}
+void neon_u8_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window);
+}
+void neon_u16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window);
+}
+void neon_u32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window);
+}
+void neon_u8_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<uint8_t>(c, x, y, output, window);
+}
+void neon_u16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<uint16_t>(c, x, y, output, window);
+}
+void neon_u32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<uint32_t>(c, x, y, output, window);
+}
+
+} // namespace cpu
+
+} // namespace arm_compute
diff --git a/src/cpu/kernels/select/list.h b/src/cpu/kernels/select/list.h
new file mode 100644
index 0000000000..c33a25f6d6
--- /dev/null
+++ b/src/cpu/kernels/select/list.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_SELECT_LIST_H
+#define SRC_CORE_NEON_KERNELS_SELECT_LIST_H
+
+#include "arm_compute/core/ITensor.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_SELECT_KERNEL(func_name) \
+ void func_name(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+
+DECLARE_SELECT_KERNEL(neon_s8_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_s16_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_s32_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_u8_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_u16_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_u32_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_f16_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_f32_select_same_rank);
+
+DECLARE_SELECT_KERNEL(neon_s8_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_s16_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_s32_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_u8_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_u16_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_u32_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_f16_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_f32_select_not_same_rank);
+
+#undef DECLARE_SELECT_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_SELECT_LIST_H
\ No newline at end of file
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..425fcf7ac6
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+template <bool IS_LOG>
+void neon_fp16_softmax(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(lut_ptr);
+ if (axis == 0)
+ {
+ return neon_softmax_x_float<float16_t, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+ else
+ {
+ return neon_softmax_non_x_float<float16_t, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+}
+
+template void neon_fp16_softmax<true>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+template void neon_fp16_softmax<false>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..a64946eb74
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+template <bool IS_LOG>
+void neon_fp32_softmax(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(lut_ptr);
+ if (axis == 0)
+ {
+ return neon_softmax_x_float<float, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+ else
+ {
+ return neon_softmax_non_x_float<float, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+}
+
+template void neon_fp32_softmax<true>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+template void neon_fp32_softmax<false>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+} // namespace cpu
+} // namespace arm_compute
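Both float kernels above hand off to neon_softmax_x_float / neon_softmax_non_x_float, which perform the usual three passes along the softmax axis: find the row maximum, accumulate exp(beta * (v - max)) while keeping the exponentials in the tmp buffer, then normalize by the sum (or subtract log(sum) for the log-softmax variant). A scalar reference sketch of that computation follows; the function name and the std::vector scratch buffer are illustrative only.

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: reference softmax / log-softmax over one row of length n.
template <bool IS_LOG>
void softmax_row_ref(const float *in, float *out, int n, float beta)
{
    if (n <= 0)
    {
        return;
    }

    // Pass 1: row maximum
    const float max_val = *std::max_element(in, in + n);

    // Pass 2: exponentials (or scaled differences for log-softmax) and their sum
    std::vector<float> tmp(n);
    float sum = 0.f;
    for (int i = 0; i < n; ++i)
    {
        const float d = (in[i] - max_val) * beta;
        tmp[i] = IS_LOG ? d : std::exp(d);
        sum += IS_LOG ? std::exp(d) : tmp[i];
    }

    // Pass 3: normalization
    const float norm = IS_LOG ? std::log(sum) : 1.f / sum;
    for (int i = 0; i < n; ++i)
    {
        out[i] = IS_LOG ? (tmp[i] - norm) : (tmp[i] * norm);
    }
}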
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
new file mode 100644
index 0000000000..31baf8a9df
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+#include "support/SaturateCast.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T, bool IS_LOG>
+void neon_softmax_x_quantized(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(axis);
+
+ static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
+ "quantized type should be either qasymm8_t or qasymm8_signed_t.");
+
+ const int input_width = in->info()->valid_region().shape.x();
+
+ const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
+ const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta);
+
+ Iterator in_it(in, window);
+ Iterator out_it(out, window);
+
+ constexpr int vec_size = 16;
+
+#ifndef __aarch64__
+ const int sum_stages = log2(vec_size >> 1);
+#endif // __aarch64__
+
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr());
+ T *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+ float *tmp_ptr = reinterpret_cast<float *>(tmp);
+
+ T max_val;
+
+ /* Compute Max */
+ {
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = 0;
+
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+
+#ifdef __aarch64__
+ max_val = wrapper::vmaxv(vec_max);
+#else // __aarch64__
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+
+ max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
+
+ // Compute left-over elements
+ for (; x < input_width; ++x)
+ {
+ max_val = std::max(*(in_ptr + x), max_val);
+ }
+ } // Compute Max
+
+ float sum_transformed{};
+
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+
+ /* Init sum to zero */
+ float32x4x4_t vec_sum = {
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ };
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+
+ if (IS_LOG)
+ {
+ vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
+ vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
+ vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
+ vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
+ }
+ else
+ {
+ vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
+ vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
+ vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
+ vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ }
+
+ vst4q_f32(tmp_ptr + x, vec_elements_flt);
+ }
+
+ /* Reduce sum */
+ const float32x4_t sum_16_byte =
+ vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+
+ float sum;
+
+#ifdef __aarch64__
+ sum = wrapper::vaddv(sum_16_byte);
+#else // __aarch64__
+ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
+ sum_res = vpadd_f32(sum_res, sum_res);
+ sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
+
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ float element{};
+ if (IS_LOG)
+ {
+ element = (max_val - in_ptr[x]) * scale_beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((max_val - in_ptr[x]) * scale_beta);
+ sum += element;
+ }
+
+ tmp_ptr[x] = element;
+ }
+
+ if (!IS_LOG)
+ {
+ sum_transformed = 256.f / sum;
+ }
+ else
+ {
+ sum_transformed = std::log(sum);
+ }
+ } // Compute exponentials and sum
+
+ /* Normalize exponentials */
+ {
+ constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+
+ const float32x4_t sum_vec = vdupq_n_f32(sum_transformed);
+
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
+ float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
+ int_vec_type normalized_value{};
+ if (IS_LOG)
+ {
+ const float32x4x4_t sub = {
+ vsubq_f32(vec_in.val[0], sum_vec),
+ vsubq_f32(vec_in.val[1], sum_vec),
+ vsubq_f32(vec_in.val[2], sum_vec),
+ vsubq_f32(vec_in.val[3], sum_vec),
+ };
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ }
+ else
+ {
+ float32x4x4_t mul = {
+ vmulq_f32(vec_in.val[0], sum_vec),
+ vmulq_f32(vec_in.val[1], sum_vec),
+ vmulq_f32(vec_in.val[2], sum_vec),
+ vmulq_f32(vec_in.val[3], sum_vec),
+ };
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
+ mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
+ mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
+ mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
+ mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ }
+
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
+ }
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ if (IS_LOG)
+ {
+ out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed);
+ }
+ else
+ {
+ out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) -
+ (is_qasymm8_signed ? 128.f : 0));
+ }
+ }
+ } // Normalize exponentials
+ },
+ in_it, out_it);
+}
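
For reference, a minimal scalar sketch of the per-row normalization that the x-axis quantized kernel above performs. This is an illustration only, not library code; the helper name and its arguments are assumptions. It shows why the non-log path scales by 256/sum before the saturating cast; the additional -128 offset applies only to QASYMM8_SIGNED outputs.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Illustrative scalar equivalent of the x-axis QASYMM8 softmax above (assumed helper, not ACL code).
// scale is the input's uniform quantization scale; the 256/sum factor maps exp(...) onto the
// 8-bit output range, and the saturating cast clamps the single-element case (256) to 255.
// For QASYMM8_SIGNED the kernel additionally subtracts 128 before casting.
std::vector<uint8_t> softmax_qasymm8_row_ref(const std::vector<uint8_t> &in, float scale, float beta)
{
    const float   scale_beta = -beta * scale;
    const uint8_t max_val    = *std::max_element(in.begin(), in.end());

    std::vector<float> tmp(in.size());
    float              sum = 0.f;
    for (size_t i = 0; i < in.size(); ++i)
    {
        tmp[i] = std::exp((max_val - in[i]) * scale_beta);
        sum += tmp[i];
    }

    const float          sum_inv = 256.f / sum;
    std::vector<uint8_t> out(in.size());
    for (size_t i = 0; i < in.size(); ++i)
    {
        out[i] = static_cast<uint8_t>(std::min(255.f, std::max(0.f, tmp[i] * sum_inv)));
    }
    return out;
}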
+
+template <typename T, bool IS_LOG>
+void neon_softmax_non_x_quantized(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window)
+{
+ static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
+ "quantized type should be either qasymm8_t or qasymm8_signed_t.");
+
+ const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
+ const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta);
+
+ Iterator in_it(in, window);
+ Iterator out_it(out, window);
+
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ constexpr int vec_size = 16;
+ const ITensorInfo *in_info = in->info();
+ const ITensorInfo *out_info = out->info();
+ const int x_width = in_info->valid_region().shape.x();
+ const int in_axis_stride = in_info->strides_in_bytes()[axis];
+ const int out_axis_stride = out_info->strides_in_bytes()[axis];
+ const int tmp_axis_stride = in_axis_stride;
+ const int axis_width = in_info->dimension(axis);
+ const int end_actual = std::min(window[0].end(), x_width);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &winCoords)
+ {
+ const bool vector_exceeds_bounds = ((winCoords[0] + vec_size) > end_actual);
+
+ int num_remaining = (end_actual - winCoords[0]);
+ int num_remaining_full = num_remaining / 4;
+ int num_remaining_partial = num_remaining % 4;
+
+ /* Get pointers */
+ const uint8_t *in_ptr = in_it.ptr();
+ uint8_t *out_ptr = out_it.ptr();
+ uint8_t *tmp_ptr = reinterpret_cast<uint8_t *>(tmp);
+
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+
+ /* Compute Max */
+ {
+ if (!vector_exceeds_bounds)
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ const auto current_value =
+ wrapper::vloadq((i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr));
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+ }
+ else
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ const T *const base_ptr_in = ((i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr));
+ int j = 0;
+ for (; j < num_remaining; ++j)
+ {
+ const T current_value = *(base_ptr_in + j);
+ vec_max[j] = std::max(vec_max[j], current_value);
+ }
+ }
+ }
+ } // Compute Max
+
+ float32x4x4_t vec_sum_transformed = {
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ };
+
+ /* Compute exponentials and sum */
+ {
+ /* Init sum to zero */
+ float32x4x4_t vec_sum = vec_sum_transformed;
+
+ auto vec_elements = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+
+ float32x4x4_t vec_elements_flt;
+
+ if (!vector_exceeds_bounds)
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ vec_elements = wrapper::vloadq((i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr));
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+
+ if (IS_LOG)
+ {
+ vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
+ vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
+ vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
+ vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
+ }
+ else
+ {
+ vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
+ vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
+ vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
+ vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ }
+ vst4q_f32((i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr), vec_elements_flt);
+ }
+
+ auto vec_256 = wrapper::vdup_n(static_cast<float32_t>(256.f), ExactTagType{});
+ if (!IS_LOG)
+ {
+ vec_sum_transformed.val[0] = wrapper::vdiv(vec_256, vec_sum.val[0]);
+ vec_sum_transformed.val[1] = wrapper::vdiv(vec_256, vec_sum.val[1]);
+ vec_sum_transformed.val[2] = wrapper::vdiv(vec_256, vec_sum.val[2]);
+ vec_sum_transformed.val[3] = wrapper::vdiv(vec_256, vec_sum.val[3]);
+ }
+ else
+ {
+ vec_sum_transformed.val[0] = wrapper::vlog(vec_sum.val[0]);
+ vec_sum_transformed.val[1] = wrapper::vlog(vec_sum.val[1]);
+ vec_sum_transformed.val[2] = wrapper::vlog(vec_sum.val[2]);
+ vec_sum_transformed.val[3] = wrapper::vlog(vec_sum.val[3]);
+ }
+ }
+ else
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ const T *const base_ptr_in = (i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr);
+ auto vec_elements = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ // vec_els is functionally redundant but is needed as a workaround for a toolchain bug.
+ std::vector<T> vec_els(16);
+
+ for (int k = 0; k < num_remaining_full; ++k)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ vec_els[k * 4 + j] = *(base_ptr_in + (4 * k + j));
+ }
+ }
+ for (int j = 0; j < num_remaining_partial; ++j)
+ {
+ vec_els[num_remaining_full * 4 + j] = *(base_ptr_in + (4 * num_remaining_full + j));
+ }
+ for (int q = 0; q < 16; q++)
+ {
+ vec_elements[q] = vec_els[q];
+ }
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+
+ if (IS_LOG)
+ {
+ vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
+ vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
+ vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
+ vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
+ }
+ else
+ {
+ vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
+ vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
+ vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
+ vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ }
+
+ float *const base_ptr_tmp = (i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr);
+ for (int k = 0; k < num_remaining_full; ++k)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ *(base_ptr_tmp + (4 * k + j)) = vec_elements_flt.val[k][j];
+ }
+ }
+
+ for (int j = 0; j < num_remaining_partial; ++j)
+ {
+ *(base_ptr_tmp + (4 * num_remaining_full + j)) =
+ vec_elements_flt.val[num_remaining_full][j];
+ }
+ }
+
+ auto vec_256 = wrapper::vdup_n(static_cast<float32_t>(256), ExactTagType{});
+ if (!IS_LOG)
+ {
+ vec_sum_transformed.val[0] = wrapper::vdiv(vec_256, vec_sum.val[0]);
+ vec_sum_transformed.val[1] = wrapper::vdiv(vec_256, vec_sum.val[1]);
+ vec_sum_transformed.val[2] = wrapper::vdiv(vec_256, vec_sum.val[2]);
+ vec_sum_transformed.val[3] = wrapper::vdiv(vec_256, vec_sum.val[3]);
+ }
+ else
+ {
+ vec_sum_transformed.val[0] = wrapper::vlog(vec_sum.val[0]);
+ vec_sum_transformed.val[1] = wrapper::vlog(vec_sum.val[1]);
+ vec_sum_transformed.val[2] = wrapper::vlog(vec_sum.val[2]);
+ vec_sum_transformed.val[3] = wrapper::vlog(vec_sum.val[3]);
+ }
+ }
+ } // Compute exponentials and sum
+
+ /* Normalize exponentials */
+ {
+ constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+ if (!vector_exceeds_bounds)
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
+ float32x4x4_t vec_in = vld4q_f32((i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr));
+
+ int_vec_type normalized_value{};
+
+ if (IS_LOG)
+ {
+ const float32x4x4_t sub = {
+ vsubq_f32(vec_in.val[0], vec_sum_transformed.val[0]),
+ vsubq_f32(vec_in.val[1], vec_sum_transformed.val[1]),
+ vsubq_f32(vec_in.val[2], vec_sum_transformed.val[2]),
+ vsubq_f32(vec_in.val[3], vec_sum_transformed.val[3]),
+ };
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ }
+ else
+ {
+ float32x4x4_t mul = {
+ vmulq_f32(vec_in.val[0], vec_sum_transformed.val[0]),
+ vmulq_f32(vec_in.val[1], vec_sum_transformed.val[1]),
+ vmulq_f32(vec_in.val[2], vec_sum_transformed.val[2]),
+ vmulq_f32(vec_in.val[3], vec_sum_transformed.val[3]),
+ };
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
+ mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
+ mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
+ mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
+ mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ }
+
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
+ }
+ wrapper::vstore((i * out_axis_stride) + reinterpret_cast<T *>(out_ptr), normalized_value);
+ }
+ }
+ else
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ T *const base_ptr_out = (i * out_axis_stride) + reinterpret_cast<T *>(out_ptr);
+ float *const base_ptr_tmp = (i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr);
+ if (IS_LOG)
+ {
+ for (int k = 0; k < num_remaining_full; ++k)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ *(base_ptr_out + (4 * k + j)) = utils::cast::saturate_cast<T>(
+ (*(base_ptr_tmp + (4 * k + j)) - vec_sum_transformed.val[k][j]));
+ }
+ }
+ for (int j = 0; j < num_remaining_partial; ++j)
+ {
+ *(base_ptr_out + (4 * num_remaining_full + j)) =
+ utils::cast::saturate_cast<T>(*(base_ptr_tmp + (4 * num_remaining_full + j)) -
+ vec_sum_transformed.val[num_remaining_full][j]);
+ }
+ }
+ else
+ {
+ for (int k = 0; k < num_remaining_full; ++k)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ *(base_ptr_out + (4 * k + j)) = utils::cast::saturate_cast<T>(
+ *(base_ptr_tmp + (4 * k + j)) * vec_sum_transformed.val[k][j] -
+ (is_qasymm8_signed ? 128.f : 0));
+ }
+ }
+ for (int j = 0; j < num_remaining_partial; ++j)
+ {
+ *(base_ptr_out + (4 * num_remaining_full + j)) =
+ utils::cast::saturate_cast<T>(*(base_ptr_tmp + (4 * num_remaining_full + j)) *
+ vec_sum_transformed.val[num_remaining_full][j] -
+ (is_qasymm8_signed ? 128.f : 0));
+ }
+ }
+ }
+ }
+ } // Normalize exponentials
+ },
+ in_it, out_it);
+}
+
+template void neon_softmax_x_quantized<qasymm8_signed_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template void neon_softmax_x_quantized<qasymm8_signed_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template void neon_softmax_x_quantized<qasymm8_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template void neon_softmax_x_quantized<qasymm8_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template void neon_softmax_non_x_quantized<qasymm8_signed_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template void neon_softmax_non_x_quantized<qasymm8_signed_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template void neon_softmax_non_x_quantized<qasymm8_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template void neon_softmax_non_x_quantized<qasymm8_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
new file mode 100644
index 0000000000..e417271d0e
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#ifdef __aarch64__
+namespace
+{
+// These helper functions are added because vaddv does not exist for fp16 and is
+// therefore not part of the wrapper::vaddv interface.
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages)
+{
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a));
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ sum_res = wrapper::vpadd(sum_res, sum_res);
+ }
+ return wrapper::vgetlane(sum_res, 0);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+inline float wrapper_vaddv(const float32x4_t &a, int sum_stages)
+{
+ ARM_COMPUTE_UNUSED(sum_stages);
+ return wrapper::vaddv(a);
+}
+} // namespace
+#endif // __aarch64__
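
The sum_stages value used with these helpers is log2(vec_size / 2), i.e. the number of pairwise-add rounds still needed after combining the high and low halves (see neon_softmax_x_float below). As a reading aid, the same reduction spelled out with plain NEON intrinsics for float32x4_t; this is an illustrative sketch, not part of the wrapper API.

#include <arm_neon.h>

// Illustrative only: the pairwise reduction pattern used by these helpers and the non-aarch64
// fallback below, written with plain intrinsics. For float32x4_t, one extra vpadd after the
// high/low combine finishes the sum, which matches sum_stages = log2(vec_size / 2) = 1.
static inline float reduce_add_f32x4(float32x4_t v)
{
    float32x2_t r = vpadd_f32(vget_high_f32(v), vget_low_f32(v)); // {c+d, a+b}
    r             = vpadd_f32(r, r);                              // {a+b+c+d, a+b+c+d}
    return vget_lane_f32(r, 0);
}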
+
+// The template implementations for the float data types are kept in this header because
+// all fp16-instantiated code needs to live in fp16.cpp files.
+template <typename T, bool IS_LOG>
+void neon_softmax_x_float(const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(axis);
+ ARM_COMPUTE_UNUSED(tmp);
+
+ const int input_width = in->info()->valid_region().shape.x();
+
+ Iterator in_it(in, window);
+ Iterator out_it(out, window);
+
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ constexpr int vec_size = 16 / sizeof(T);
+
+ const int sum_stages = log2(vec_size >> 1);
+
+ const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr());
+ T *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+
+ T max_val;
+
+ /* Compute Max */
+ {
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = 0;
+
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+
+#ifdef __aarch64__
+ max_val = wrapper::vmaxv(vec_max);
+#else // __aarch64__
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+
+ max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
+
+ // Compute left-over elements
+ for (; x < input_width; ++x)
+ {
+ max_val = std::max(*(in_ptr + x), max_val);
+ }
+ } // Compute Max
+
+ T sum_transformed{};
+
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
+
+ /* Init sum to zero */
+ auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vsub(vec_elements, vec_max);
+ if (IS_LOG)
+ {
+ vec_elements = wrapper::vmul(vec_elements, beta_vec);
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ }
+ else
+ {
+ vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ }
+ wrapper::vstore(out_ptr + x, vec_elements);
+ }
+
+ /* Reduce sum */
+ T sum{};
+#ifdef __aarch64__
+ sum = wrapper_vaddv(vec_sum, sum_stages);
+#else // __aarch64__
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ sum_res = wrapper::vpadd(sum_res, sum_res);
+ }
+ sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
+
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ T element{};
+
+ if (IS_LOG)
+ {
+ element = (in_ptr[x] - max_val) * beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((in_ptr[x] - max_val) * beta);
+ sum += element;
+ }
+
+ out_ptr[x] = element;
+ }
+
+ if (!IS_LOG)
+ {
+ sum_transformed = T(1) / sum;
+ }
+ else
+ {
+ sum_transformed = static_cast<T>(std::log(sum));
+ }
+ } // Compute exponentials and sum
+
+ /* Normalize exponentials */
+ {
+ const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{});
+
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ const auto vec_in = wrapper::vloadq(out_ptr + x);
+ if (IS_LOG)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec));
+ }
+ else
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec));
+ }
+ }
+
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ if (IS_LOG)
+ {
+ out_ptr[x] = out_ptr[x] - sum_transformed;
+ }
+ else
+ {
+ out_ptr[x] = out_ptr[x] * sum_transformed;
+ }
+ }
+ } // Normalize exponentials
+ },
+ in_it, out_it);
+}
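
As a reading aid, a scalar reference of the row softmax above (illustrative sketch only, not library code): the stored value is either the beta-scaled difference (log-softmax) or its exponential, and the final pass subtracts log(sum) or multiplies by 1/sum accordingly.

#include <algorithm>
#include <cmath>

// Illustrative scalar reference for one row of (log-)softmax with a beta scale.
// Assumes width > 0; the function name and signature are assumptions, not ACL code.
template <bool IS_LOG>
void softmax_row_ref(const float *in, float *out, int width, float beta)
{
    float max_val = in[0];
    for (int x = 1; x < width; ++x)
    {
        max_val = std::max(max_val, in[x]);
    }

    float sum = 0.f;
    for (int x = 0; x < width; ++x)
    {
        const float shifted = (in[x] - max_val) * beta;
        out[x]              = IS_LOG ? shifted : std::exp(shifted);
        sum += std::exp(shifted);
    }

    const float norm = IS_LOG ? std::log(sum) : 1.f / sum;
    for (int x = 0; x < width; ++x)
    {
        out[x] = IS_LOG ? (out[x] - norm) : (out[x] * norm);
    }
}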
+template <typename T, bool IS_LOG>
+void neon_softmax_non_x_float(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(tmp);
+
+ Iterator in_it(in, window);
+ Iterator out_it(out, window);
+
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
+ constexpr int vec_size = 16 / sizeof(T);
+ const ITensorInfo *in_info = in->info();
+ const ITensorInfo *out_info = out->info();
+ const int x_width = in_info->valid_region().shape.x();
+ const unsigned int in_axis_stride = in_info->strides_in_bytes()[axis];
+ const unsigned int out_axis_stride = out_info->strides_in_bytes()[axis];
+ const int axis_width = in_info->dimension(axis);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &winCoords)
+ {
+ const bool vector_exceeds_bounds = (winCoords[0] + vec_size) > x_width;
+
+ /* Get pointers */
+ const uint8_t *in_ptr = in_it.ptr();
+ uint8_t *out_ptr = out_it.ptr();
+
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+
+ /* Compute Max */
+ {
+ if (!vector_exceeds_bounds)
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ const auto current_value =
+ wrapper::vloadq(reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr));
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+ }
+ else
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ const T *const base_ptr_in = reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr);
+ int j = 0;
+ for (; j < (x_width - winCoords[0]); ++j)
+ {
+ const auto current_value = *(base_ptr_in + j);
+ vec_max[j] = std::max(vec_max[j], current_value);
+ }
+ }
+ }
+ } // Compute Max
+
+ auto vec_sum_transformed = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+
+ auto vec_elements = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ /* Init sum to zero */
+ auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+
+ /* Compute exponentials and sum */
+ {
+ if (!vector_exceeds_bounds)
+ {
+ const auto vec_one = wrapper::vdup_n(static_cast<T>(1), ExactTagType{});
+ /* Loop over row and compute exponentials and sum */
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ vec_elements = wrapper::vloadq(reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr));
+ vec_elements = wrapper::vsub(vec_elements, vec_max);
+ if (IS_LOG)
+ {
+ vec_elements = wrapper::vmul(vec_elements, beta_vec);
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ }
+ else
+ {
+ vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ }
+
+ wrapper::vstore(reinterpret_cast<T *>((i * out_axis_stride) + out_ptr), vec_elements);
+ }
+
+ if (!IS_LOG)
+ {
+ vec_sum_transformed = wrapper::vdiv(vec_one, vec_sum);
+ }
+ else
+ {
+ vec_sum_transformed = wrapper::vlog(vec_sum);
+ }
+ }
+ else
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ const T *const base_ptr_in = reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr);
+ T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr);
+ int j = 0;
+ for (; j < (x_width - winCoords[0]); ++j)
+ {
+ vec_elements[j] = *(base_ptr_in + j);
+ vec_elements[j] -= vec_max[j];
+ if (IS_LOG)
+ {
+ vec_elements[j] *= beta;
+ vec_sum[j] += std::exp(vec_elements[j]);
+ }
+ else
+ {
+ vec_elements[j] = std::exp(vec_elements[j] * beta);
+ vec_sum[j] += vec_elements[j];
+ }
+ *(base_ptr_out + j) = vec_elements[j];
+ }
+ }
+ int j = 0;
+ for (; j < (x_width - winCoords[0]); ++j)
+ {
+ if (!IS_LOG)
+ {
+ vec_sum_transformed[j] = 1 / vec_sum[j];
+ }
+ else
+ {
+ vec_sum_transformed[j] = std::log(vec_sum[j]);
+ }
+ }
+ }
+ } // Compute exponentials and sum
+
+ /* Normalize exponentials */
+ {
+ if (!vector_exceeds_bounds)
+ {
+ /* Loop over row and compute softmax */
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr);
+ auto vec_in = wrapper::vloadq(base_ptr_out);
+ if (IS_LOG)
+ {
+ wrapper::vstore(base_ptr_out, wrapper::vsub(vec_in, vec_sum_transformed));
+ }
+ else
+ {
+ wrapper::vstore(base_ptr_out, wrapper::vmul(vec_in, vec_sum_transformed));
+ }
+ }
+ }
+ else
+ {
+ int i = 0;
+ for (; i < axis_width; ++i)
+ {
+ T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr);
+ int j = 0;
+ for (; j < (x_width - winCoords[0]); ++j)
+ {
+ if (IS_LOG)
+ {
+ *(base_ptr_out + j) -= vec_sum_transformed[j];
+ }
+ else
+ {
+ *(base_ptr_out + j) *= vec_sum_transformed[j];
+ }
+ }
+ }
+ }
+ } // Normalize exponentials
+ },
+ in_it, out_it);
+}
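
The non-x variant differs from the x variant mainly in addressing: each vector covers vec_size adjacent x positions, and the softmax axis is walked by adding the axis stride in bytes. A scalar sketch of that traversal for a single lane, non-log path (illustrative only; the function and parameter names are assumptions):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative sketch of the byte-stride traversal used by the non-x kernels, reduced to a
// single scalar lane. Element i of the softmax axis lives at base + i * stride (strides in bytes).
void softmax_axis_ref(const uint8_t *in_base,
                      uint8_t       *out_base,
                      int            axis_width,
                      unsigned int   in_stride,
                      unsigned int   out_stride,
                      float          beta)
{
    auto in_at  = [&](int i) { return *reinterpret_cast<const float *>(in_base + i * in_stride); };
    auto out_at = [&](int i) -> float & { return *reinterpret_cast<float *>(out_base + i * out_stride); };

    float max_val = in_at(0);
    for (int i = 1; i < axis_width; ++i)
    {
        max_val = std::max(max_val, in_at(i));
    }

    float sum = 0.f;
    for (int i = 0; i < axis_width; ++i)
    {
        const float e = std::exp((in_at(i) - max_val) * beta);
        out_at(i)     = e;
        sum += e;
    }

    const float inv_sum = 1.f / sum;
    for (int i = 0; i < axis_width; ++i)
    {
        out_at(i) *= inv_sum;
    }
}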
+template <typename T, bool IS_LOG>
+void neon_softmax_x_quantized(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+
+template <typename T, bool IS_LOG>
+void neon_softmax_non_x_quantized(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..369f9bb005
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <bool IS_LOG>
+void neon_qasymm8_softmax(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(lut_ptr);
+ if (axis == 0)
+ {
+ return neon_softmax_x_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+ else
+ {
+ return neon_softmax_non_x_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+}
+
+template void neon_qasymm8_softmax<true>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+template void neon_qasymm8_softmax<false>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..594ceb7654
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <bool IS_LOG>
+void neon_qasymm8_signed_softmax(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(lut_ptr);
+ if (axis == 0)
+ {
+ return neon_softmax_x_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+ else
+ {
+ return neon_softmax_non_x_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, axis, window);
+ }
+}
+
+template void neon_qasymm8_signed_softmax<true>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+template void neon_qasymm8_signed_softmax<false>(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sme2/fp16.cpp b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp
new file mode 100644
index 0000000000..e70c9f4793
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+// SoftMax
+//
+// Steps:
+// * Find max: max_value = max(src)
+// * Regularize: dst[i] = exp(src[i] - max_value)
+// sum_value = sum(dst)
+// * Normalize: dst[i] = dst[i] / sum_value
+void sme2_f16_softmax_kernel( //
+ const float16_t *src,
+ float16_t *dst,
+ float beta,
+ const uintptr_t shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4])
+{
+ __asm__ volatile(
+ R"(
+ .inst 0xd503477f // smstart
+
+ // Registers
+ //
+ // * x9: temporary, index
+ // * x10: temporary, -inf
+ // * x11: temporary, 0
+ // * x12: temporary, 1.0f
+ // * x13: temporary, body_length
+ //
+ // * x20: index_3
+ // * x21: src_3
+ // * x22: dst_3
+ // * x23: index_2
+ // * x24: src_2
+ // * x25: dst_2
+ // * x26: index_1
+ // * x27: src_1
+ // * x28: dst_1
+ //
+ // * z0: c1
+ // * z1: c2
+ // * z2: c3
+ // * z3: c4
+ // * z4: c5
+ // * z5: shift
+ // * z6: inv_ln2
+ // * z7: neg_ln2_hi
+ // * z8: neg_ln2_lo
+ // * z9: min_input
+ // * z10: 23, 0
+ // * z11: max_value
+ // * z12-z15: x, x_fp32_lower_halves, r_hi, r, r2
+ // * z16-z19: max_value, shift, z, scale, poly
+ // * z20-z21: n, p1, p12345
+ // * z22-z23: n, p23, p2345
+ // * z24-z25: p45
+ // * z26: beta
+ // * z28-z31: sum_value, x_fp32_upper_halves
+ //
+ // * za0-za3: sum_value
+ //
+ // * p0: all-true
+ // * p1: left-over predicate for find-max & normalize loops
+ // * p2-p4: left-over predicates for regularize loop
+ // * p4-p7: underflow in vector loop
+ // * p5-p6: underflow in leftover loop
+ // *
+ // * pn9: all-true
+
+ // Prepares all constant values
+
+ ptrue p0.b
+ .inst 0x25207811 // ptrue pn9.b
+
+ mov w9, #0xfff6 // c1: 0x1.ffffecp-1f = 0x3f7ffff6
+ mov w10, #0xfedb // c2: 0x1.fffdb6p-2f = 0x3efffedb
+ mov w11, #0xaf33 // c3: 0x1.555e66p-3f = 0x3e2aaf33
+ mov w12, #0x9f17 // c4: 0x1.573e2ep-5f = 0x3d2b9f17
+ mov w13, #0x2010 // c5: 0x1.0e4020p-7f = 0x3c072010
+
+ movk w9, #0x3f7f, LSL #16 // c1: 0x1.ffffecp-1f = 0x3f7ffff6
+ movk w10, #0x3eff, LSL #16 // c2: 0x1.fffdb6p-2f = 0x3efffedb
+ movk x11, #0x3e2a, LSL #16 // c3: 0x1.555e66p-3f = 0x3e2aaf33
+ movk w12, #0x3d2b, LSL #16 // c4: 0x1.573e2ep-5f = 0x3d2b9f17
+ movk w13, #0x3c07, LSL #16 // c5: 0x1.0e4020p-7f = 0x3c072010
+
+ dup z0.s, w9 // c1.
+ dup z1.s, w10 // c2.
+ dup z2.s, w11 // c3.
+ dup z3.s, w12 // c4.
+ dup z4.s, w13 // c5.
+
+ mov w9, #0x007f // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f
+ mov w10, #0xaa3b // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b
+ mov w11, #0x7200 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200
+ mov w12, #0xbe8e // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e
+ mov w13, #0x47ae // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae
+
+ movk w9, #0x4b00, LSL #16 // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f
+ movk w10, #0x3fb8, LSL #16 // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b
+ movk w11, #0xbf31, LSL #16 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200
+ movk w12, #0xb5bf, LSL #16 // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e
+ movk w13, #0xc2ad, LSL #16 // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae
+
+ dup z5.s, w9 // shift
+ dup z6.s, w10 // inv_ln2
+ dup z7.s, w11 // neg_ln2_hi
+ dup z8.s, w12 // neg_ln2_lo
+ dup z9.s, w13 // min_input
+
+ dup z26.s, %w[beta] // beta
+ fcvt h26, s26
+ dup z26.h, z26.h[0]
+
+ mov w10, #0xfc00 // -inf: 0xfc00 for fp16
+
+ mov w11, #0 // 0
+
+ // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl
+ cnth x13, ALL, MUL #4
+ udiv x9, %x[length], x13
+ mul x13, x13, x9
+
+ // ==================================================
+ // 3D loop opening
+ // ==================================================
+
+ mov x20, %x[shape_3]
+ mov x21, %x[src]
+ mov x22, %x[dst]
+
+loop_3_start%=:
+ // for index_3 in shape_3 downto 1
+ cmp x20, #0
+ b.eq loop_3_end%=
+ sub x20, x20, #1
+
+ mov x23, %x[shape_2]
+ mov x24, x21
+ mov x25, x22
+
+loop_2_start%=:
+ // for index_2 in shape_2 downto 1
+ cmp x23, #0
+ b.eq loop_2_end%=
+ sub x23, x23, #1
+
+ mov x26, %x[shape_1]
+ mov x27, x24
+ mov x28, x25
+
+loop_1_start%=:
+ // for index_1 in shape_1 downto 1
+ cmp x26, #0
+ b.eq loop_1_end%=
+ sub x26, x26, #1
+
+ // ==================================================
+ // Step 1: Find max
+ // ==================================================
+
+ // ---------------------------------------------------------------- z16-z19: max_value = -inf
+ dup z16.h, w10
+ dup z17.h, w10
+ dup z18.h, w10
+ dup z19.h, w10
+
+ // Loop for processing 4 vectors per iteration.
+ mov x9, #0 // x9: index
+ dup z11.h, w10 // z11: max_value = -inf
+
+find_max_body_start%=:
+ cmp x9, x13
+ b.eq find_max_body_end%=
+
+ .inst 0xa009a76c // ld1h {z12.h-z15.h}, pn9/z, [x27, x9, LSL #1] // z12-z15: x
+ .inst 0xc16cb910 // fmax {z16.h-z19.h}, {z16.h-z19.h}, {z12.h-z15.h} // z16-z19: max_value = max(max_value, x)
+
+ inch x9, ALL, MUL #4
+ b find_max_body_start%=
+find_max_body_end%=:
+
+ // Loop for processing the leftover part.
+find_max_leftover_start%=:
+ whilelo p1.h, x9, %x[length]
+ b.none find_max_leftover_end%=
+
+ ld1h z12.h, p1/z, [x27, x9, LSL #1] // z12: x
+ fmax z16.h, p1/m, z16.h, z12.h // z16: max_value = max(max_value, x)
+
+ inch x9
+ b find_max_leftover_start%=
+find_max_leftover_end%=:
+
+ // ---------------------------------------------------------------- z16: max_value
+ .inst 0xc172b110 // fmax {z16.h-z17.h}, {z16.h-z17.h}, {z18.h-z19.h}
+ fmax z16.h, p0/m, z16.h, z17.h
+ fmaxv h16, p0, z16.h
+
+ // ---------------------------------------------------------------- z11: max_value
+ dup z11.h, z16.h[0]
+
+ // ==================================================
+ // Step 2: Regularize, i.e. calculate exp(x - max(x))
+ // ==================================================
+
+ .inst 0xc00800ff // zero {za0.s, za1.s, za2.s, za3.s} za0-za3: sum_value (in fp32)
+
+ // Loop for processing 4 vectors per iteration.
+ mov x9, #0 // ---------------------------------------------------- x9: index
+
+regularize_body_start%=:
+ cmp x9, x13
+ b.eq regularize_body_end%=
+
+ // Loads the input data to 4 consecutive registers ---------------- z12-z15: input_data
+ .inst 0xa009a76c // ld1h {z12.h-z15.h}, pn9/z, [x27, x9, LSL #1] // z12-z15: x
+
+ // ---------------------------------------------------------------- z12-z15: x = input_data - max_value
+ fsub z12.h, z12.h, z11.h
+ fsub z13.h, z13.h, z11.h
+ fsub z14.h, z14.h, z11.h
+ fsub z15.h, z15.h, z11.h
+
+ // ---------------------------------------------------------------- z12-z15: x = (input_data - max_value) * beta
+ fmul z12.h, z12.h, z26.h
+ fmul z13.h, z13.h, z26.h
+ fmul z14.h, z14.h, z26.h
+ fmul z15.h, z15.h, z26.h
+
+ // ----------------------------------------------------------------
+ // Convert fp16 values to fp32. This results in four more registers.
+ // z12 --> z12, z28
+ fcvtlt z28.s, p0/m, z12.h
+ fcvt z12.s, p0/m, z12.h
+
+ // z13 --> z13, z29
+ fcvtlt z29.s, p0/m, z13.h
+ fcvt z13.s, p0/m, z13.h
+
+ // z14 --> z14, z30
+ fcvtlt z30.s, p0/m, z14.h
+ fcvt z14.s, p0/m, z14.h
+
+ // z15 --> z15, z31
+ fcvtlt z31.s, p0/m, z15.h
+ fcvt z15.s, p0/m, z15.h
+
+ // ----------------------------------------------------------------
+ // Process z12-z15
+ // ----------------------------------------------------------------
+ // ---------------------------------------------------------------- z16-z19: shift
+ mov z16.d, z5.d
+ mov z17.d, z5.d
+ mov z18.d, z5.d
+ mov z19.d, z5.d
+
+ // ---------------------------------------------------------------- p4-p7: underflow = x < min_input
+ fcmlt p4.s, p0/z, z12.s, z9.s
+ fcmlt p5.s, p0/z, z13.s, z9.s
+ fcmlt p6.s, p0/z, z14.s, z9.s
+ fcmlt p7.s, p0/z, z15.s, z9.s
+
+ // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2
+ fmla z16.s, p0/m, z12.s, z6.s
+ fmla z17.s, p0/m, z13.s, z6.s
+ fmla z18.s, p0/m, z14.s, z6.s
+ fmla z19.s, p0/m, z15.s, z6.s
+
+ // ---------------------------------------------------------------- z20-z23: n = z - shift
+ fsub z20.s, z16.s, z5.s
+ fsub z21.s, z17.s, z5.s
+ fsub z22.s, z18.s, z5.s
+ fsub z23.s, z19.s, z5.s
+
+ // ---------------------------------------------------------------- z12-z15: r_hi = x + n * neg_ln2_hi
+ fmla z12.s, p0/m, z20.s, z7.s
+ fmla z13.s, p0/m, z21.s, z7.s
+ fmla z14.s, p0/m, z22.s, z7.s
+ fmla z15.s, p0/m, z23.s, z7.s
+
+ // ---------------------------------------------------------------- z12-z15: r = r_hi + n * neg_ln2_lo
+ fmla z12.s, p0/m, z20.s, z8.s
+ fmla z13.s, p0/m, z21.s, z8.s
+ fmla z14.s, p0/m, z22.s, z8.s
+ fmla z15.s, p0/m, z23.s, z8.s
+
+ // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n)
+ dup z10.s, #23
+ urshl z16.s, p0/m, z16.s, z10.s
+ urshl z17.s, p0/m, z17.s, z10.s
+ urshl z18.s, p0/m, z18.s, z10.s
+ urshl z19.s, p0/m, z19.s, z10.s
+
+ // Processes the first 2 vectors. (z12-z13)
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z12.s, z0.s
+ fmul z21.s, z13.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z12.s, z2.s
+ fmla z23.s, p0/m, z13.s, z2.s
+
+ // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z12.s, z4.s
+ fmla z25.s, p0/m, z13.s, z4.s
+
+ // ---------------------------------------------------------------- z12-z13: r2 = r * r
+ fmul z12.s, z12.s, z12.s
+ fmul z13.s, z13.s, z13.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z12.s, z24.s
+ fmla z23.s, p0/m, z13.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z12.s, z22.s
+ fmla z21.s, p0/m, z13.s, z23.s
+
+ // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale
+ fmla z16.s, p0/m, z20.s, z16.s
+ fmla z17.s, p0/m, z21.s, z17.s
+
+ // Processes the last 2 vectors (z14-z15)
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z14.s, z0.s
+ fmul z21.s, z15.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z14.s, z2.s
+ fmla z23.s, p0/m, z15.s, z2.s
+
+ // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z14.s, z4.s
+ fmla z25.s, p0/m, z15.s, z4.s
+
+ // ---------------------------------------------------------------- z14-z15: r2 = r * r
+ fmul z14.s, z14.s, z14.s
+ fmul z15.s, z15.s, z15.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z14.s, z24.s
+ fmla z23.s, p0/m, z15.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z14.s, z22.s
+ fmla z21.s, p0/m, z15.s, z23.s
+
+ // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale
+ fmla z18.s, p0/m, z20.s, z18.s
+ fmla z19.s, p0/m, z21.s, z19.s
+
+ // ---------------------------------------------------------------- z12-z15: poly = underflow ? 0 : poly
+ dup z10.s, #0
+ sel z12.s, p4, z10.s, z16.s
+ sel z13.s, p5, z10.s, z17.s
+ sel z14.s, p6, z10.s, z18.s
+ sel z15.s, p7, z10.s, z19.s
+
+ // ---------------------------------------------------------------- sum in fp32
+ .inst 0xc1a17d80 // fadd za.s[w11, #0, VGx4], {z12.s-z15.s} za0-za3: sum_value = sum_value + poly
+
+ // ----------------------------------------------------------------
+ // Process z28-z31
+ // ----------------------------------------------------------------
+ // ---------------------------------------------------------------- z16-z19: shift
+ mov z16.d, z5.d
+ mov z17.d, z5.d
+ mov z18.d, z5.d
+ mov z19.d, z5.d
+
+ // ---------------------------------------------------------------- p4-p7: underflow = x < min_input
+ fcmlt p4.s, p0/z, z28.s, z9.s
+ fcmlt p5.s, p0/z, z29.s, z9.s
+ fcmlt p6.s, p0/z, z30.s, z9.s
+ fcmlt p7.s, p0/z, z31.s, z9.s
+
+ // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2
+ fmla z16.s, p0/m, z28.s, z6.s
+ fmla z17.s, p0/m, z29.s, z6.s
+ fmla z18.s, p0/m, z30.s, z6.s
+ fmla z19.s, p0/m, z31.s, z6.s
+
+ // ---------------------------------------------------------------- z20-z23: n = z - shift
+ fsub z20.s, z16.s, z5.s
+ fsub z21.s, z17.s, z5.s
+ fsub z22.s, z18.s, z5.s
+ fsub z23.s, z19.s, z5.s
+
+ // ---------------------------------------------------------------- z28-z31: r_hi = x + n * neg_ln2_hi
+ fmla z28.s, p0/m, z20.s, z7.s
+ fmla z29.s, p0/m, z21.s, z7.s
+ fmla z30.s, p0/m, z22.s, z7.s
+ fmla z31.s, p0/m, z23.s, z7.s
+
+ // ---------------------------------------------------------------- z28-z31: r = r_hi + n * neg_ln2_lo
+ fmla z28.s, p0/m, z20.s, z8.s
+ fmla z29.s, p0/m, z21.s, z8.s
+ fmla z30.s, p0/m, z22.s, z8.s
+ fmla z31.s, p0/m, z23.s, z8.s
+
+ // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n)
+ dup z10.s, #23
+ urshl z16.s, p0/m, z16.s, z10.s
+ urshl z17.s, p0/m, z17.s, z10.s
+ urshl z18.s, p0/m, z18.s, z10.s
+ urshl z19.s, p0/m, z19.s, z10.s
+
+ // Processes the first 2 vectors. (z28-z29)
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z28.s, z0.s
+ fmul z21.s, z29.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z28.s, z2.s
+ fmla z23.s, p0/m, z29.s, z2.s
+
+ // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z28.s, z4.s
+ fmla z25.s, p0/m, z29.s, z4.s
+
+ // ---------------------------------------------------------------- z28-z29: r2 = r * r
+ fmul z28.s, z28.s, z28.s
+ fmul z29.s, z29.s, z29.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z28.s, z24.s
+ fmla z23.s, p0/m, z29.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z28.s, z22.s
+ fmla z21.s, p0/m, z29.s, z23.s
+
+ // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale
+ fmla z16.s, p0/m, z20.s, z16.s
+ fmla z17.s, p0/m, z21.s, z17.s
+
+ // Processes the last 2 vectors (z30-z31)
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z30.s, z0.s
+ fmul z21.s, z31.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z30.s, z2.s
+ fmla z23.s, p0/m, z31.s, z2.s
+
+ // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z30.s, z4.s
+ fmla z25.s, p0/m, z31.s, z4.s
+
+ // ---------------------------------------------------------------- z30-z31: r2 = r * r
+ fmul z30.s, z30.s, z30.s
+ fmul z31.s, z31.s, z31.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z30.s, z24.s
+ fmla z23.s, p0/m, z31.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z30.s, z22.s
+ fmla z21.s, p0/m, z31.s, z23.s
+
+ // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale
+ fmla z18.s, p0/m, z20.s, z18.s
+ fmla z19.s, p0/m, z21.s, z19.s
+
+ // ---------------------------------------------------------------- z28-z31: poly = underflow ? 0 : poly
+ dup z10.s, #0
+ sel z28.s, p4, z10.s, z16.s
+ sel z29.s, p5, z10.s, z17.s
+ sel z30.s, p6, z10.s, z18.s
+ sel z31.s, p7, z10.s, z19.s
+
+ // ---------------------------------------------------------------- sum in fp32
+ .inst 0xc1a17f80 // fadd za.s[w11, #0, VGx4], {z28.s-z31.s} za0-za3: sum_value = sum_value + poly
+
+ fcvt z12.h, p0/m, z12.s
+ fcvtnt z12.h, p0/m, z28.s
+
+ fcvt z13.h, p0/m, z13.s
+ fcvtnt z13.h, p0/m, z29.s
+
+ fcvt z14.h, p0/m, z14.s
+ fcvtnt z14.h, p0/m, z30.s
+
+ fcvt z15.h, p0/m, z15.s
+ fcvtnt z15.h, p0/m, z31.s
+
+ // Stores 4 consecutive registers to the output
+ .inst 0xa029a78c // st1h {z12.h-z15.h}, pn9, [x28, x9, LSL #1]
+
+ inch x9, ALL, MUL #4
+ b regularize_body_start%=
+regularize_body_end%=:
+
+ // ---------------------------------------------------------------- z28: sum_value
+ .inst 0xc0066c1c // mova {z28.s-z31.s}, za.s[w11, #0, VGx4]
+ fadd z28.s, z28.s, z29.s
+ fadd z30.s, z30.s, z31.s
+ fadd z28.s, z28.s, z30.s
+
+ // Loop for processing the leftover part.
+regularize_leftover_start%=:
+ whilelo p2.h, x9, %x[length]
+ b.none regularize_leftover_end%=
+
+ ld1h z12.h, p2/z, [x27, x9, LSL #1] // z12: input_data
+
+ fsub z12.h, z12.h, z11.h // z12: x = input_data - max_value
+ fmul z12.h, z12.h, z26.h // z12: x = (input_data - max_value) * beta
+
+ // ---------------------------------------------------------------- z12.h --> z12.s, z13.s
+ fcvtlt z13.s, p2/m, z12.h
+ fcvt z12.s, p2/m, z12.h
+
+ // ---------------------------------------------------------------- p3, p4: predicates for z12, z13
+ pfalse p1.b
+ trn1 p3.h, p2.h, p1.h // for z12
+ trn2 p4.h, p2.h, p1.h // for z13
+
+ mov z16.d, z5.d // z16: shift
+ mov z17.d, z5.d // z17: shift
+ fcmlt p5.s, p3/z, z12.s, z9.s // p5: underflow = x < min_input
+ fcmlt p6.s, p4/z, z13.s, z9.s // p6: underflow = x < min_input
+ fmla z16.s, p3/m, z12.s, z6.s // z16: z = shift + x * inv_ln2
+ fmla z17.s, p4/m, z13.s, z6.s // z17: z = shift + x * inv_ln2
+ fsub z20.s, z16.s, z5.s // z20: n = z - shift
+ fsub z21.s, z17.s, z5.s // z21: n = z - shift
+ fmla z12.s, p3/m, z20.s, z7.s // z12: r_hi = x + n * neg_ln2_hi
+ fmla z13.s, p4/m, z21.s, z7.s // z13: r_hi = x + n * neg_ln2_hi
+ fmla z12.s, p3/m, z20.s, z8.s // z12: r = r_hi + n * neg_ln2_lo
+ fmla z13.s, p4/m, z21.s, z8.s // z13: r = r_hi + n * neg_ln2_lo
+ dup z10.s, #23 // z10: 23
+ urshl z16.s, p3/m, z16.s, z10.s // z16: scale = z << 23 (2^n)
+ urshl z17.s, p4/m, z17.s, z10.s // z17: scale = z << 23 (2^n)
+ fmul z20.s, z12.s, z0.s // z20: p1 = r * c1
+ fmul z21.s, z13.s, z0.s // z21: p1 = r * c1
+ mov z22.d, z1.d // z22: p23 = c2
+ mov z23.d, z1.d // z23: p23 = c2
+ fmla z22.s, p3/m, z12.s, z2.s // z22: p23 = c2 + r * c3
+ fmla z23.s, p4/m, z13.s, z2.s // z23: p23 = c2 + r * c3
+ mov z24.d, z3.d // z24: c4
+ mov z25.d, z3.d // z25: c4
+ fmla z24.s, p3/m, z12.s, z4.s // z24: p45 = c4 + r * c5
+ fmla z25.s, p4/m, z13.s, z4.s // z25: p45 = c4 + r * c5
+ fmul z12.s, z12.s, z12.s // z12: r2 = r * r
+ fmul z13.s, z13.s, z13.s // z13: r2 = r * r
+ fmla z22.s, p3/m, z12.s, z24.s // z22: p2345 = p23 + r2 * p45
+ fmla z23.s, p4/m, z13.s, z25.s // z23: p2345 = p23 + r2 * p45
+ fmla z20.s, p3/m, z12.s, z22.s // z20: p12345 = p1 + r2 * p2345
+ fmla z21.s, p4/m, z13.s, z23.s // z21: p12345 = p1 + r2 * p2345
+ fmla z16.s, p3/m, z20.s, z16.s // z16: poly = scale + p12345 * scale
+ fmla z17.s, p4/m, z21.s, z17.s // z17: poly = scale + p12345 * scale
+ dup z10.s, #0 // z10: 0
+ sel z16.s, p5, z10.s, z16.s // z16: poly = underflow ? 0 : poly
+ sel z17.s, p6, z10.s, z17.s // z17: poly = underflow ? 0 : poly
+ fadd z28.s, p3/m, z28.s, z16.s // z28: sum_value = sum_value + poly
+ fadd z28.s, p4/m, z28.s, z17.s // z28: sum_value = sum_value + poly
+
+ fcvt z16.h, p3/m, z16.s
+ fcvtnt z16.h, p4/m, z17.s
+ st1h z16.h, p2, [x28, x9, LSL #1]
+
+ inch x9
+ b regularize_leftover_start%=
+regularize_leftover_end%=:
+
+ // ==================================================
+ // Step 3: Normalize
+ // ==================================================
+
+ // ---------------------------------------------------------------- z28: inv_sum_value = 1 / sum_value
+ faddv s28, p0, z28.s
+ fmov s29, #1.0 // 1.0f
+ fdiv s28, s29, s28
+ fcvt h28, s28
+
+ dup z28.h, z28.h[0]
+
+ // Loop for processing 4 vectors per iteration.
+ mov x9, #0 // x9: index
+
+normalize_body_start%=:
+ cmp x9, x13
+ b.eq normalize_body_end%=
+
+ .inst 0xa009a78c // ld1h {z12.h-z15.h}, pn9/z, [x28, x9, LSL #1]
+
+ // ---------------------------------------------------------------- z12-z15: result = x * inv_sum_value
+ fmul z12.h, z12.h, z28.h
+ fmul z13.h, z13.h, z28.h
+ fmul z14.h, z14.h, z28.h
+ fmul z15.h, z15.h, z28.h
+
+ .inst 0xa029a78c // st1h {z12.h-z15.h}, pn9, [x28, x9, LSL #1]
+
+ inch x9, ALL, MUL #4
+ b normalize_body_start%=
+normalize_body_end%=:
+
+ // Loop for processing the leftover part.
+normalize_leftover_start%=:
+ whilelo p1.h, x9, %x[length]
+ b.none normalize_leftover_end%=
+
+ ld1h z12.h, p1/z, [x28, x9, LSL #1] // z12: x
+ fmul z12.h, z12.h, z28.h // z12: result = x * inv_sum_value
+
+ st1h z12.h, p1, [x28, x9, LSL #1]
+
+ inch x9
+ b normalize_leftover_start%=
+normalize_leftover_end%=:
+
+ // ==================================================
+ // 3D loop closing
+ // ==================================================
+
+ add x27, x27, %x[src_stride_1]
+ add x28, x28, %x[dst_stride_1]
+ b loop_1_start%=
+loop_1_end%=:
+
+ add x24, x24, %x[src_stride_2]
+ add x25, x25, %x[dst_stride_2]
+ b loop_2_start%=
+loop_2_end%=:
+
+ add x21, x21, %x[src_stride_3]
+ add x22, x22, %x[dst_stride_3]
+ b loop_3_start%=
+loop_3_end%=:
+
+ .inst 0xd503467f // smstop
+ )"
+ :
+ : [src] "r"(src), [dst] "r"(dst), [beta] "r"(beta), //
+ [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), //
+ [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]),
+ [src_stride_3] "r"(src_strides[3]), //
+ [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]),
+ [dst_stride_3] "r"(dst_strides[3]), //
+ [length] "r"(shape[0]) //
+ : "cc", "memory", //
+ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p9", //
+ "x9", "x10", "x11", "x12", "x13", "x14", //
+ "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", //
+ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", //
+ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", //
+ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", //
+ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" //
+ );
+}
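
The Regularize loop above evaluates exp() with the range-reduction-plus-polynomial scheme suggested by the constants it materializes (shift, inv_ln2, neg_ln2_hi/lo, c1..c5, min_input). As a reading aid, a scalar C++ sketch of that approximation; the constants are copied from the register comments above, the code assumes C++17 hexadecimal float literals, and the function itself is not part of the patch.

#include <cstdint>
#include <cstring>

// Reading aid only: scalar version of the exp() approximation the vector loop appears to use.
// exp(x) ~= 2^n * (1 + c1*r + c2*r^2 + c3*r^3 + c4*r^4 + c5*r^5), where n = round(x / ln2)
// and r = x - n*ln2 is the reduced argument. Inputs below min_input underflow to zero.
inline float exp_approx_f32(float x)
{
    const float shift      = 0x1.0000fep23f;   // 2^23 + 127
    const float inv_ln2    = 0x1.715476p+0f;   // 1 / ln(2)
    const float neg_ln2_hi = -0x1.62e400p-1f;  // high bits of -ln(2)
    const float neg_ln2_lo = -0x1.7f7d1cp-20f; // low bits of -ln(2)
    const float c1 = 0x1.ffffecp-1f, c2 = 0x1.fffdb6p-2f, c3 = 0x1.555e66p-3f;
    const float c4 = 0x1.573e2ep-5f, c5 = 0x1.0e4020p-7f;
    const float min_input = -86.64f; // approximately ln(2^-125)

    if (x < min_input)
    {
        return 0.0f;
    }

    const float z = shift + x * inv_ln2;                 // (n + 127) ends up in the low mantissa bits
    const float n = z - shift;                           // n = round(x / ln2) as a float
    const float r = x + n * neg_ln2_hi + n * neg_ln2_lo; // r = x - n * ln2

    uint32_t scale_bits;
    std::memcpy(&scale_bits, &z, sizeof(scale_bits));
    scale_bits <<= 23; // move (n + 127) into the exponent field
    float scale;
    std::memcpy(&scale, &scale_bits, sizeof(scale)); // scale = 2^n

    const float r2   = r * r;
    const float poly = c1 * r + (c2 + c3 * r) * r2 + (c4 + c5 * r) * (r2 * r2);
    return scale + poly * scale; // scale * (1 + poly)
}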
+
+void sme2_fp16_softmax(const ITensor *in,
+ void *const,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(lut_ptr);
+ ARM_COMPUTE_UNUSED(axis);
+
+ const auto *src_info = in->info();
+ const auto *dst_info = out->info();
+
+ const auto &full_shape = dst_info->tensor_shape();
+ const auto &src_strides = src_info->strides_in_bytes();
+ const auto &dst_strides = dst_info->strides_in_bytes();
+
+ const uintptr_t k_shape[] = {
+ full_shape[0],
+ window.num_iterations(1),
+ window.num_iterations(2),
+ window.num_iterations(3),
+ };
+
+ const uintptr_t k_src_strides[] = {
+ src_strides[0],
+ src_strides[1],
+ src_strides[2],
+ src_strides[3],
+ };
+
+ const uintptr_t k_dst_strides[] = {
+ dst_strides[0],
+ dst_strides[1],
+ dst_strides[2],
+ dst_strides[3],
+ };
+
+ const uintptr_t k_src_offset = window[0].start() * src_strides[0] + //
+ window[1].start() * src_strides[1] + //
+ window[2].start() * src_strides[2] + //
+ window[3].start() * src_strides[3];
+
+ const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + //
+ window[1].start() * dst_strides[1] + //
+ window[2].start() * dst_strides[2] + //
+ window[3].start() * dst_strides[3];
+
+ const auto *k_src = reinterpret_cast<const float16_t *>(in->buffer() + k_src_offset);
+ auto *k_dst = reinterpret_cast<float16_t *>(out->buffer() + k_dst_offset);
+
+ sme2_f16_softmax_kernel(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/cpu/kernels/softmax/generic/sme2/fp32.cpp b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp
new file mode 100644
index 0000000000..5e29d51746
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp
@@ -0,0 +1,585 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+// SoftMax
+//
+// Steps:
+// * Find max: max_value = max(src)
+// * Regularize: dst[i] = exp(src[i] - max_value)
+// sum_value = sum(dst)
+// * Normalize: dst[i] = dst[i] / sum_value
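+//
+// For orientation, a scalar sketch of what the kernel computes per row (illustrative
+// only; the helper name, the contiguity assumption and the <cmath>/<algorithm> usage
+// are not part of this file):
+//
+//   void softmax_row_ref(const float *src, float *dst, size_t n, float beta)
+//   {
+//       float max_value = src[0];
+//       for (size_t i = 1; i < n; ++i) max_value = std::max(max_value, src[i]);
+//
+//       float sum_value = 0.f;
+//       for (size_t i = 0; i < n; ++i)
+//       {
+//           dst[i] = std::exp((src[i] - max_value) * beta);
+//           sum_value += dst[i];
+//       }
+//
+//       for (size_t i = 0; i < n; ++i) dst[i] /= sum_value;
+//   }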
+void sme2_f32_softmax_kernel( //
+ const float *src,
+ float *dst,
+ float beta,
+ const uintptr_t shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4])
+{
+ // Precondition:
+ // * src_strides[0] == sizeof(float)
+ // * dst_strides[0] == sizeof(float)
+
+ __asm__ volatile(
+ R"(
+ .inst 0xd503477f // smstart
+
+ // Registers
+ //
+ // * x9: temporary, index
+ // * x10: temporary, -inf
+ // * x11: temporary, 0
+ // * x12: temporary, 1.0f
+ // * x13: temporary, body_length
+ //
+ // * x20: index_3
+ // * x21: src_3
+ // * x22: dst_3
+ // * x23: index_2
+ // * x24: src_2
+ // * x25: dst_2
+ // * x26: index_1
+ // * x27: src_1
+ // * x28: dst_1
+ //
+ // * z0: c1
+ // * z1: c2
+ // * z2: c3
+ // * z3: c4
+ // * z4: c5
+ // * z5: shift
+ // * z6: inv_ln2
+ // * z7: neg_ln2_hi
+ // * z8: neg_ln2_lo
+ // * z9: min_input
+ // * z10: 23, 0
+ // * z11: max_value
+ // * z12-z15: x, r_hi, r, r2
+ // * z16-z19: max_value, shift, z, scale, poly
+ // * z20-z21: n, p1, p12345
+ // * z22-z23: n, p23, p2345
+ // * z24-z25: p45
+ // * z26: beta
+ // * z28-z31: sum_value
+ //
+ // * za0-za3: sum_value
+ //
+ // * p0: all-true
+ // * p1: left-over predicate
+ // * p4-p7: underflow
+ // * pn9: all-true
+
+ // Prepares all constant values
+
+ ptrue p0.b
+ .inst 0x25207811 // ptrue pn9.b
+
+ mov w9, #0xfff6 // c1: 0x1.ffffecp-1f = 0x3f7ffff6
+ mov w10, #0xfedb // c2: 0x1.fffdb6p-2f = 0x3efffedb
+ mov w11, #0xaf33 // c3: 0x1.555e66p-3f = 0x3e2aaf33
+ mov w12, #0x9f17 // c4: 0x1.573e2ep-5f = 0x3d2b9f17
+ mov w13, #0x2010 // c5: 0x1.0e4020p-7f = 0x3c072010
+
+ movk w9, #0x3f7f, LSL #16 // c1: 0x1.ffffecp-1f = 0x3f7ffff6
+ movk w10, #0x3eff, LSL #16 // c2: 0x1.fffdb6p-2f = 0x3efffedb
+ movk x11, #0x3e2a, LSL #16 // c3: 0x1.555e66p-3f = 0x3e2aaf33
+ movk w12, #0x3d2b, LSL #16 // c4: 0x1.573e2ep-5f = 0x3d2b9f17
+ movk w13, #0x3c07, LSL #16 // c5: 0x1.0e4020p-7f = 0x3c072010
+
+ dup z0.s, w9 // c1.
+ dup z1.s, w10 // c2.
+ dup z2.s, w11 // c3.
+ dup z3.s, w12 // c4.
+ dup z4.s, w13 // c5.
+
+ mov w9, #0x007f // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f
+ mov w10, #0xaa3b // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b
+ mov w11, #0x7200 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200
+ mov w12, #0xbe8e // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e
+ mov w13, #0x47ae // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae
+
+ movk w9, #0x4b00, LSL #16 // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f
+ movk w10, #0x3fb8, LSL #16 // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b
+ movk w11, #0xbf31, LSL #16 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200
+ movk w12, #0xb5bf, LSL #16 // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e
+ movk w13, #0xc2ad, LSL #16 // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae
+
+ dup z5.s, w9 // shift
+ dup z6.s, w10 // inv_ln2
+ dup z7.s, w11 // neg_ln2_hi
+ dup z8.s, w12 // neg_ln2_lo
+ dup z9.s, w13 // min_input
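+
+    // The constants above implement a standard range-reduction + polynomial exp()
+    // approximation (a sketch of the scheme as used below, with x already scaled by beta):
+    //   z     = shift + x * inv_ln2      // low mantissa bits of z now hold n + 127
+    //   n     = z - shift                // n ~= round(x / ln(2))
+    //   r     = x + n * neg_ln2_hi + n * neg_ln2_lo   // r = x - n*ln(2), split for accuracy
+    //   scale = bits(z) << 23            // IEEE-754 encoding of 2^n
+    //   exp(x) ~= scale * (1 + c1*r + c2*r^2 + c3*r^3 + c4*r^4 + c5*r^5)
+    // Inputs below min_input would underflow, so their result is forced to 0 later on.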
+
+ dup z26.s, %w[beta] // beta
+
+ mov w10, #0x0000 // -inf: 0xff800000
+    movk w10, #0xff80, LSL #16 // -inf: 0xff800000
+
+ mov w11, #0 // 0
+
+ // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl
+ cntw x13, ALL, MUL #4
+ udiv x9, %x[length], x13
+ mul x13, x13, x9
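+    // e.g. at a 512-bit vector length (16 FP32 lanes, 64 lanes per 4-vector iteration)
+    // and length = 150: body_length = (150 / 64) * 64 = 128, leaving 22 elements for
+    // the leftover loops.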
+
+ // ==================================================
+ // 3D loop opening
+ // ==================================================
+
+ mov x20, %x[shape_3]
+ mov x21, %x[src]
+ mov x22, %x[dst]
+
+loop_3_start%=:
+ // for index_3 in shape_3 downto 1
+ cmp x20, #0
+ b.eq loop_3_end%=
+ sub x20, x20, #1
+
+ mov x23, %x[shape_2]
+ mov x24, x21
+ mov x25, x22
+
+loop_2_start%=:
+ // for index_2 in shape_2 downto 1
+ cmp x23, #0
+ b.eq loop_2_end%=
+ sub x23, x23, #1
+
+ mov x26, %x[shape_1]
+ mov x27, x24
+ mov x28, x25
+
+loop_1_start%=:
+    // for index_1 in shape_1 downto 1
+ cmp x26, #0
+ b.eq loop_1_end%=
+ sub x26, x26, #1
+
+ // ==================================================
+ // Step 1: Find max
+ // ==================================================
+
+ // Loop for processing 4 vectors per iteration.
+ mov x9, #0 // x9: index
+ dup z11.s, w10 // z11: max_value = -inf
+
+ // ---------------------------------------------------------------- z16-z19: max_value = -inf
+ mov z16.d, z11.d
+ mov z17.d, z11.d
+ mov z18.d, z11.d
+ mov z19.d, z11.d
+
+find_max_body_start%=:
+ cmp x9, x13
+ b.eq find_max_body_end%=
+
+ .inst 0xa009c76c // ld1w {z12.s-z15.s}, pn9/z, [x27, x9, LSL #2] // z12-z15: x
+ .inst 0xc1acb910 // fmax {z16.s-z19.s}, {z16.s-z19.s}, {z12.s-z15.s} // z16-z19: max_value = max(max_value, x)
+
+ incw x9, ALL, MUL #4
+ b find_max_body_start%=
+find_max_body_end%=:
+
+ // Loop for processing the leftover part.
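+    // whilelo builds a predicate covering the lanes still in range (index < length);
+    // b.none exits once no active lanes remain.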
+find_max_leftover_start%=:
+ whilelo p1.s, x9, %x[length]
+ b.none find_max_leftover_end%=
+
+ ld1w z12.s, p1/z, [x27, x9, LSL #2] // z12: x
+ fmax z16.s, p1/m, z16.s, z12.s // z16: max_value = max(max_value, x)
+
+ incw x9
+ b find_max_leftover_start%=
+find_max_leftover_end%=:
+
+ // ---------------------------------------------------------------- z16: max_value
+ .inst 0xc1b2b110 // fmax {z16.s-z17.s}, {z16.s-z17.s}, {z18.s-z19.s}
+ fmax z16.s, p0/m, z16.s, z17.s
+ fmaxv s16, p0, z16.s
+
+ // ---------------------------------------------------------------- z11: max_value
+ dup z11.s, z16.s[0]
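+
+    // Reduction summary: the four per-vector maxima in z16-z19 are folded pairwise,
+    // fmaxv collapses the remaining vector to a scalar, and the result is broadcast
+    // to every lane of z11 for the subtraction in the regularize step.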
+
+ // ==================================================
+ // Step 2: Regularize
+ // ==================================================
+
+ .inst 0xc00800ff // zero {za0.s, za1.s, za2.s, za3.s} za0-za3: sum_value
+
+ // Loop for processing 4 vectors per iteration.
+ mov x9, #0 // ---------------------------------------------------- x9: index
+
+regularize_body_start%=:
+ cmp x9, x13
+ b.eq regularize_body_end%=
+
+ // Loads the input data to 4 consecutive registers ---------------- z12-z15: input_data
+ .inst 0xa009c76c // ld1w {z12.s-z15.s}, pn9/z, [x27, x9, LSL #2]
+
+ // ---------------------------------------------------------------- z12-z15: x = input_data - max_value
+ fsub z12.s, z12.s, z11.s
+ fsub z13.s, z13.s, z11.s
+ fsub z14.s, z14.s, z11.s
+ fsub z15.s, z15.s, z11.s
+
+ // ---------------------------------------------------------------- z12-z15: x = (input_data - max_value) * beta
+ fmul z12.s, z12.s, z26.s
+ fmul z13.s, z13.s, z26.s
+ fmul z14.s, z14.s, z26.s
+ fmul z15.s, z15.s, z26.s
+
+ // ---------------------------------------------------------------- z16-z19: shift
+ mov z16.d, z5.d
+ mov z17.d, z5.d
+ mov z18.d, z5.d
+ mov z19.d, z5.d
+
+ // ---------------------------------------------------------------- p4-p7: underflow = x < min_input
+ fcmlt p4.s, p0/z, z12.s, z9.s
+ fcmlt p5.s, p0/z, z13.s, z9.s
+ fcmlt p6.s, p0/z, z14.s, z9.s
+ fcmlt p7.s, p0/z, z15.s, z9.s
+
+ // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2
+ fmla z16.s, p0/m, z12.s, z6.s
+ fmla z17.s, p0/m, z13.s, z6.s
+ fmla z18.s, p0/m, z14.s, z6.s
+ fmla z19.s, p0/m, z15.s, z6.s
+
+ // ---------------------------------------------------------------- z20-z23: n = z - shift
+ fsub z20.s, z16.s, z5.s
+ fsub z21.s, z17.s, z5.s
+ fsub z22.s, z18.s, z5.s
+ fsub z23.s, z19.s, z5.s
+
+ // ---------------------------------------------------------------- z12-z15: r_hi = x + n * neg_ln2_hi
+ fmla z12.s, p0/m, z20.s, z7.s
+ fmla z13.s, p0/m, z21.s, z7.s
+ fmla z14.s, p0/m, z22.s, z7.s
+ fmla z15.s, p0/m, z23.s, z7.s
+
+ // ---------------------------------------------------------------- z12-z15: r = r_hi + n * neg_ln2_lo
+ fmla z12.s, p0/m, z20.s, z8.s
+ fmla z13.s, p0/m, z21.s, z8.s
+ fmla z14.s, p0/m, z22.s, z8.s
+ fmla z15.s, p0/m, z23.s, z8.s
+
+ // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n)
+ dup z10.s, #23
+ urshl z16.s, p0/m, z16.s, z10.s
+ urshl z17.s, p0/m, z17.s, z10.s
+ urshl z18.s, p0/m, z18.s, z10.s
+ urshl z19.s, p0/m, z19.s, z10.s
+
+ // Processes the first 2 vectors.
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z12.s, z0.s
+ fmul z21.s, z13.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z12.s, z2.s
+ fmla z23.s, p0/m, z13.s, z2.s
+
+    // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z12.s, z4.s
+ fmla z25.s, p0/m, z13.s, z4.s
+
+ // ---------------------------------------------------------------- z12-z13: r2 = r * r
+ fmul z12.s, z12.s, z12.s
+ fmul z13.s, z13.s, z13.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z12.s, z24.s
+ fmla z23.s, p0/m, z13.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z12.s, z22.s
+ fmla z21.s, p0/m, z13.s, z23.s
+
+ // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale
+ fmla z16.s, p0/m, z20.s, z16.s
+ fmla z17.s, p0/m, z21.s, z17.s
+
+ // Processes the last 2 vectors
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z14.s, z0.s
+ fmul z21.s, z15.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z14.s, z2.s
+ fmla z23.s, p0/m, z15.s, z2.s
+
+    // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z14.s, z4.s
+ fmla z25.s, p0/m, z15.s, z4.s
+
+ // ---------------------------------------------------------------- z14-z15: r2 = r * r
+ fmul z14.s, z14.s, z14.s
+ fmul z15.s, z15.s, z15.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z14.s, z24.s
+ fmla z23.s, p0/m, z15.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z14.s, z22.s
+ fmla z21.s, p0/m, z15.s, z23.s
+
+ // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale
+ fmla z18.s, p0/m, z20.s, z18.s
+ fmla z19.s, p0/m, z21.s, z19.s
+
+ // ---------------------------------------------------------------- z16-z19: poly = underflow ? 0 : poly
+ dup z10.s, #0
+ sel z16.s, p4, z10.s, z16.s
+ sel z17.s, p5, z10.s, z17.s
+ sel z18.s, p6, z10.s, z18.s
+ sel z19.s, p7, z10.s, z19.s
+
+ // Stores 4 consecutive registers to the output
+ .inst 0xa029c790 // st1w {z16.s-z19.s}, pn9, [x28, x9, LSL #2]
+
+ .inst 0xc1a17e00 // fadd za.s[w11, #0, VGx4], {z16.s-z19.s} za0-za3: sum_value = sum_value + poly
+
+ incw x9, ALL, MUL #4
+ b regularize_body_start%=
+regularize_body_end%=:
+
+ // ---------------------------------------------------------------- z28: sum_value
+ .inst 0xc0066c1c // mova {z28.s-z31.s}, za.s[w11, #0, VGx4]
+ fadd z28.s, z28.s, z29.s
+ fadd z30.s, z30.s, z31.s
+ fadd z28.s, z28.s, z30.s
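+
+    // The four ZA vector groups hold the partial sums from the body loop; mova copies
+    // them out and the pairwise fadds above fold them into z28, which the leftover
+    // loop below keeps accumulating before the final faddv reduction.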
+
+ // Loop for processing the leftover part.
+regularize_leftover_start%=:
+ whilelo p1.s, x9, %x[length]
+ b.none regularize_leftover_end%=
+
+    ld1w z12.s, p1/z, [x27, x9, LSL #2] // z12: input_data
+
+ fsub z12.s, z12.s, z11.s // z12: x = input_data - max_value
+ fmul z12.s, z12.s, z26.s // z12: x = (input_data - max_value) * beta
+
+ mov z16.d, z5.d // z16: shift
+ fcmlt p4.s, p1/z, z12.s, z9.s // p4: underflow = x < min_input
+ fmla z16.s, p1/m, z12.s, z6.s // z16: z = shift + x * inv_ln2
+ fsub z20.s, z16.s, z5.s // z20: n = z - shift
+ fmla z12.s, p1/m, z20.s, z7.s // z12: r_hi = x + n * neg_ln2_hi
+ fmla z12.s, p1/m, z20.s, z8.s // z12: r = r_hi + n * neg_ln2_lo
+ dup z10.s, #23 // z10: 23
+ urshl z16.s, p1/m, z16.s, z10.s // z16: scale = z << 23 (2^n)
+ fmul z20.s, z12.s, z0.s // z20: p1 = r * c1
+ mov z22.d, z1.d // z22: p23 = c2
+ fmla z22.s, p1/m, z12.s, z2.s // z22: p23 = c2 + r * c3
+ mov z24.d, z3.d // z24: c4
+ fmla z24.s, p1/m, z12.s, z4.s // z24: p45 = c4 + r * c5
+ fmul z12.s, z12.s, z12.s // z12: r2 = r * r
+ fmla z22.s, p1/m, z12.s, z24.s // z22: p2345 = p23 + r2 * p45
+ fmla z20.s, p1/m, z12.s, z22.s // z20: p12345 = p1 + r2 * p2345
+ fmla z16.s, p1/m, z20.s, z16.s // z16: poly = scale + p12345 * scale
+ dup z10.s, #0 // z10: 0
+ sel z16.s, p4, z10.s, z16.s // z16: poly = underflow ? 0 : poly
+
+ st1w z16.s, p1, [x28, x9, LSL #2]
+
+ fadd z28.s, p1/m, z28.s, z16.s // z28: sum_value = sum_value + poly
+
+ incw x9
+ b regularize_leftover_start%=
+regularize_leftover_end%=:
+
+ // ==================================================
+ // Step 3: Normalize
+ // ==================================================
+
+ // ---------------------------------------------------------------- z28: inv_sum_value = 1 / sum_value
+ fmov s29, #1.0 // 1.0f
+ faddv s28, p0, z28.s
+ fdiv s28, s29, s28
+ dup z28.s, z28.s[0]
+
+ // Loop for processing 4 vectors per iteration.
+ mov x9, #0 // x9: index
+
+normalize_body_start%=:
+ cmp x9, x13
+ b.eq normalize_body_end%=
+
+ .inst 0xa009c78c // ld1w {z12.s-z15.s}, pn9/z, [x28, x9, LSL #2] // z12-z15: x
+
+ // ---------------------------------------------------------------- z12-z15: result = x * inv_sum_value
+ fmul z12.s, z12.s, z28.s
+ fmul z13.s, z13.s, z28.s
+ fmul z14.s, z14.s, z28.s
+ fmul z15.s, z15.s, z28.s
+
+ .inst 0xa029c78c // st1w {z12.s-z15.s}, pn9, [x28, x9, LSL #2]
+
+ incw x9, ALL, MUL #4
+ b normalize_body_start%=
+normalize_body_end%=:
+
+ // Loop for processing the leftover part.
+normalize_leftover_start%=:
+ whilelo p1.s, x9, %x[length]
+ b.none normalize_leftover_end%=
+
+ ld1w z12.s, p1/z, [x28, x9, LSL #2] // z12: x
+ fmul z12.s, z12.s, z28.s // z12: result = x * inv_sum_value
+
+ st1w z12.s, p1, [x28, x9, LSL #2]
+
+ incw x9
+ b normalize_leftover_start%=
+normalize_leftover_end%=:
+
+ // ==================================================
+ // 3D loop closing
+ // ==================================================
+
+ add x27, x27, %x[src_stride_1]
+ add x28, x28, %x[dst_stride_1]
+ b loop_1_start%=
+loop_1_end%=:
+
+ add x24, x24, %x[src_stride_2]
+ add x25, x25, %x[dst_stride_2]
+ b loop_2_start%=
+loop_2_end%=:
+
+ add x21, x21, %x[src_stride_3]
+ add x22, x22, %x[dst_stride_3]
+ b loop_3_start%=
+loop_3_end%=:
+
+ .inst 0xd503467f // smstop
+ )"
+ :
+ : [src] "r"(src), [dst] "r"(dst), [beta] "r"(beta), //
+ [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), //
+ [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]),
+ [src_stride_3] "r"(src_strides[3]), //
+ [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]),
+ [dst_stride_3] "r"(dst_strides[3]), //
+ [length] "r"(shape[0]) //
+ : "cc", "memory", //
+          "p0", "p1", "p4", "p5", "p6", "p7", "p9", //
+ "x9", "x10", "x11", "x12", "x13", //
+ "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", //
+ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", //
+ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", //
+ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", //
+ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" //
+ );
+}
+
+void sme2_fp32_softmax(const ITensor *in,
+ void *const,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(lut_ptr);
+ ARM_COMPUTE_UNUSED(axis);
+
+ const auto *src_info = in->info();
+ const auto *dst_info = out->info();
+
+ const auto &full_shape = dst_info->tensor_shape();
+ const auto &src_strides = src_info->strides_in_bytes();
+ const auto &dst_strides = dst_info->strides_in_bytes();
+
+ const uintptr_t k_shape[] = {
+ full_shape[0],
+ window.num_iterations(1),
+ window.num_iterations(2),
+ window.num_iterations(3),
+ };
+
+ const uintptr_t k_src_strides[] = {
+ src_strides[0],
+ src_strides[1],
+ src_strides[2],
+ src_strides[3],
+ };
+
+ const uintptr_t k_dst_strides[] = {
+ dst_strides[0],
+ dst_strides[1],
+ dst_strides[2],
+ dst_strides[3],
+ };
+
+ const uintptr_t k_src_offset = window[0].start() * src_strides[0] + //
+ window[1].start() * src_strides[1] + //
+ window[2].start() * src_strides[2] + //
+ window[3].start() * src_strides[3];
+
+ const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + //
+ window[1].start() * dst_strides[1] + //
+ window[2].start() * dst_strides[2] + //
+ window[3].start() * dst_strides[3];
+
+ const auto *k_src = reinterpret_cast<const float *>(in->buffer() + k_src_offset);
+ auto *k_dst = reinterpret_cast<float *>(out->buffer() + k_dst_offset);
+
+ sme2_f32_softmax_kernel(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp
new file mode 100644
index 0000000000..9feb669f7c
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp
@@ -0,0 +1,634 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+// SoftMax
+//
+// Steps:
+// * Find max: max_value = max(src)
+// * Regularize: dst[i] = exp(src[i] - max_value)
+// sum_value = sum(dst)
+// * Normalize: dst[i] = dst[i] / sum_value
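+//
+// This QASYMM8 variant does not evaluate exp() directly: the caller provides a
+// 256-entry FP32 LUT holding exp(-scale * beta * k) for k = 0..255, so each element
+// reduces to the lookup lut[max_value - src[i]]. A scalar sketch of the intent
+// (illustrative only; the loop shape, the clamping and the per-row FP32 scratch
+// buffer `tmp` are assumptions made for clarity):
+//
+//   float sum_value = 0.f;
+//   for (size_t i = 0; i < n; ++i)
+//   {
+//       tmp[i] = lut[max_value - src[i]];   // exp of the regularized, scaled input
+//       sum_value += tmp[i];
+//   }
+//   for (size_t i = 0; i < n; ++i)
+//   {
+//       float scaled = tmp[i] * (256.f / sum_value);   // map [0, 1] onto [0, 256)
+//       dst[i] = (uint8_t)std::min(scaled, 255.f);     // truncate and saturate
+//   }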
+void sme2_qasymm8_softmax_kernel_512VL( //
+ const uint8_t *src,
+ uint8_t *dst,
+ float beta,
+ const uintptr_t shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4],
+ const float *lut,
+ float *tmp)
+{
+ // Precondition:
+ // * src_strides[0] == sizeof(uint8_t)
+ // * dst_strides[0] == sizeof(uint8_t)
+ // * tmp_strides[0] == sizeof(float)
+
+ __asm__ volatile(
+ R"(
+ .inst 0xd503477f // smstart
+
+ // Registers
+ //
+ // * x1: Loop index
+ // * x2: LUT index
+ // * x13: temporary, body_length
+ //
+ // * x20: index_3
+ // * x21: src_3
+ // * x22: dst_3
+ // * x23: index_2
+ // * x24: src_2
+ // * x25: dst_2
+ // * x26: index_1
+ // * x27: src_1
+ // * x28: dst_1
+    // * x29: tmp
+ //
+ //
+ // * p0: all-true
+ // * p1: predicate for QASYMM8 values
+ // * p2: predicate 0 for FP32 values (first quarter of expanded/unpacked p1)
+ // * p3: predicate 1 for FP32 values (second quarter of expanded/unpacked p1)
+ // * p4: predicate 2 for FP32 values (third quarter of expanded/unpacked p1)
+ // * p5: predicate 3 for FP32 values (fourth quarter of expanded/unpacked p1)
+ // * pn9: all-true for 32 bit values
+ // * pn8: all-true for 8-bit values
+ //
+    // * z0-z15: the 256 LUT values of exp(-scale*beta*x) for x in QASYMM8, stored as FP32 values
+
+ // Prepares all constant values
+
+ ptrue p0.b
+ .inst 0x25a07811 // ptrue pn9.s
+ .inst 0x25207810 // ptrue pn8.b
+
+ // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl
+ cntb x13, ALL, MUL #4
+ udiv x9, %x[length], x13
+ mul x13, x13, x9
+
+ // ==================================================
+ // 3D loop opening
+ // ==================================================
+
+ mov x20, %x[shape_3]
+ mov x21, %x[src]
+ mov x22, %x[dst]
+ mov x19, %x[lut]
+ mov x29, %x[tmp]
+
+ // Load the LUT to the register file.
+ mov x2, %x[lut]
+ .inst 0xa040c440 //ld1w { z0.s - z3.s }, pn9/z, [x2]
+ add x2, x2, #256
+ .inst 0xa040c444 //ld1w { z4.s - z7.s }, pn9/z, [x2]
+ add x2, x2, #256
+ .inst 0xa040c448 //ld1w { z8.s - z11.s }, pn9/z, [x2]
+ add x2, x2, #256
+ .inst 0xa040c44c //ld1w { z12.s - z15.s }, pn9/z, [x2]
+
+
+loop_3_start%=:
+ // for index_3 in shape_3 downto 1
+ cmp x20, #0
+ b.eq loop_3_end%=
+ sub x20, x20, #1
+
+ mov x23, %x[shape_2]
+ mov x24, x21
+ mov x25, x22
+
+loop_2_start%=:
+ // for index_2 in shape_2 downto 1
+ cmp x23, #0
+ b.eq loop_2_end%=
+ sub x23, x23, #1
+
+ mov x26, %x[shape_1]
+ mov x27, x24
+ mov x28, x25
+
+loop_1_start%=:
+    // for index_1 in shape_1 downto 1
+ cmp x26, #0
+ b.eq loop_1_end%=
+ sub x26, x26, #1
+
+ // ==================================================
+ // Step 1: Find max
+ // ==================================================
+    // z16-z19: initialized to the minimum QASYMM8 value (0) so it can be used as the starting point for the max search.
+ dup z16.b, #0
+ dup z17.b, #0
+ dup z18.b, #0
+ dup z19.b, #0
+ mov x1, #0 // x1: index
+find_max_body_start%=:
+ cmp x1, x13
+ b.eq find_max_body_end%=
+ .inst 0xa0018374 // ld1b { z20.b - z23.b }, pn8/z, [x27, x1] z20-z23: x
+ .inst 0xc134b811 // umax { z16.b - z19.b }, { z16.b - z19.b }, { z20.b - z23.b } z16-z19: max_value = max(max_value, x)
+    add x1, x1, #256 // Advance the index by 256 elements: four Z registers at 512-bit VL hold 2048 bits = 256 8-bit integers.
+ b find_max_body_start%=
+find_max_body_end%=:
+
+ // Loop for processing the leftover part.
+find_max_leftover_start%=:
+ whilelo p1.b, x1, %x[length]
+ b.none find_max_leftover_end%=
+
+ ld1b z30.b, p1/z, [x27, x1] // z30: x
+ umax z16.b, p1/m, z16.b, z30.b // z16: max_value = max(max_value, x)
+
+ add x1, x1, #64
+
+ b find_max_leftover_start%=
+find_max_leftover_end%=:
+
+ .inst 0xc132b011 // umax { z16.b, z17.b }, { z16.b, z17.b }, { z18.b, z19.b }
+ umax z16.b, p0/m, z16.b, z17.b
+ umaxv b16, p0, z16.b // Reduction unsigned max operation to get maximum_value
+ dup z16.b, z16.b[0]
+ uunpklo z16.h, z16.b // Using unpack instructions to align the max value with the FP32 entries in the LUT for use in the TBX instruction
+ uunpklo z16.s, z16.h
+
+ mov x1, #0 // reset index
+ dup z25.s, #0
+
+regularize_start%=:
+ whilelo p1.b, x1, %x[length]
+ b.none regularize_end%=
+
+    // p2-p5 together form the 32-bit expansion of p1; the instructions below unpack p1 into these four predicate registers so that the 32-bit loads/stores below are correctly predicated.
+ punpklo p2.h, p1.b
+ punpkhi p4.h, p1.b
+
+ punpkhi p3.h, p2.b
+ punpklo p2.h, p2.b
+
+ punpkhi p5.h, p4.b
+ punpklo p4.h, p4.b
+
+ ld1b z17.b, p1/z, [x27, x1] //z17: input data
+
+ uunpklo z18.h, z17.b //Using unpack instructions to align the input QASYMM8 values with the FP32 entries in the LUT for use in the TBX instruction
+ uunpkhi z19.h, z17.b
+
+ uunpklo z17.s, z18.h // z17 = low low input QASYMM8 values
+ uunpkhi z18.s, z18.h // z18 = low high input QASYMM8 values
+
+ uunpkhi z20.s, z19.h // z20 = high high input QASYMM8 values
+ uunpklo z19.s, z19.h // z19 = high low input QASYMM8 values
+
+    sub z17.s, z16.s, z17.s // z17: x = max_value - input_data
+    sub z18.s, z16.s, z18.s // z18: x = max_value - input_data
+    sub z19.s, z16.s, z19.s // z19: x = max_value - input_data
+    sub z20.s, z16.s, z20.s // z20: x = max_value - input_data
+
+ tbx z21.s, z0.s, z17.s // Look-up entries 0-15 in the LUT.
+ tbx z22.s, z0.s, z18.s
+ tbx z23.s, z0.s, z19.s
+ tbx z24.s, z0.s, z20.s
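+
+    // TBX can only index within one vector (16 FP32 entries at this 512-bit VL), so the
+    // 256-entry LUT spread across z0-z15 is walked in 16-entry windows: after each group
+    // of lookups the indices are decremented by 16, and TBX leaves lanes whose index now
+    // falls outside the current window untouched.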
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z1.s, z17.s // Look-up entries 16-31 in the LUT.
+ tbx z22.s, z1.s, z18.s
+ tbx z23.s, z1.s, z19.s
+ tbx z24.s, z1.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z2.s, z17.s // Look-up entries 32-47 in the LUT.
+ tbx z22.s, z2.s, z18.s
+ tbx z23.s, z2.s, z19.s
+ tbx z24.s, z2.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z3.s, z17.s // Look-up entries 48-63 in the LUT.
+ tbx z22.s, z3.s, z18.s
+ tbx z23.s, z3.s, z19.s
+ tbx z24.s, z3.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z4.s, z17.s // Look-up entries 64-79 in the LUT.
+ tbx z22.s, z4.s, z18.s
+ tbx z23.s, z4.s, z19.s
+ tbx z24.s, z4.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z5.s, z17.s // Look-up entries 80-95 in the LUT.
+ tbx z22.s, z5.s, z18.s
+ tbx z23.s, z5.s, z19.s
+ tbx z24.s, z5.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z6.s, z17.s // Look-up entries 96-111 in the LUT.
+ tbx z22.s, z6.s, z18.s
+ tbx z23.s, z6.s, z19.s
+ tbx z24.s, z6.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z7.s, z17.s // Look-up entries 112-127 in the LUT.
+ tbx z22.s, z7.s, z18.s
+ tbx z23.s, z7.s, z19.s
+ tbx z24.s, z7.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z8.s, z17.s // Look-up entries 128-143 in the LUT.
+ tbx z22.s, z8.s, z18.s
+ tbx z23.s, z8.s, z19.s
+ tbx z24.s, z8.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z9.s, z17.s // Look-up entries 144-159 in the LUT.
+ tbx z22.s, z9.s, z18.s
+ tbx z23.s, z9.s, z19.s
+ tbx z24.s, z9.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z10.s, z17.s // Look-up entries 160-175 in the LUT.
+ tbx z22.s, z10.s, z18.s
+ tbx z23.s, z10.s, z19.s
+ tbx z24.s, z10.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z11.s, z17.s // Look-up entries 176-191 in the LUT.
+ tbx z22.s, z11.s, z18.s
+ tbx z23.s, z11.s, z19.s
+ tbx z24.s, z11.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z12.s, z17.s // Look-up entries 192-207 in the LUT.
+ tbx z22.s, z12.s, z18.s
+ tbx z23.s, z12.s, z19.s
+ tbx z24.s, z12.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z13.s, z17.s // Look-up entries 208-223 in the LUT.
+ tbx z22.s, z13.s, z18.s
+ tbx z23.s, z13.s, z19.s
+ tbx z24.s, z13.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z14.s, z17.s // Look-up entries 224-239 in the LUT.
+ tbx z22.s, z14.s, z18.s
+ tbx z23.s, z14.s, z19.s
+ tbx z24.s, z14.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z15.s, z17.s // Look-up entries 240-255 in the LUT.
+ tbx z22.s, z15.s, z18.s
+ tbx z23.s, z15.s, z19.s
+ tbx z24.s, z15.s, z20.s
+
+
+ st1w z21.s, p2, [x29, x1, LSL #2]// z21 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p2/m, z25.s, z21.s
+ add x1, x1, #16
+
+ st1w z22.s, p3, [x29, x1, LSL #2]// z22 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p3/m, z25.s, z22.s
+ add x1, x1, #16
+
+ st1w z23.s, p4, [x29, x1, LSL #2]// z23 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p4/m, z25.s, z23.s
+ add x1, x1, #16
+
+ st1w z24.s, p5, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p5/m, z25.s, z24.s
+ add x1, x1, #16
+
+ b regularize_start%=
+regularize_end%=:
+
+ mov w9, 0x0000
+ movk w9, 0x4380, LSL #16 // Moving 256.f into w9 to scale - via multiplication (division by reciprocal) - the floating point [0,1] range of the results to the [0,255] integer range of QASYMM8
+ dup z29.s, w9
+ faddv s25, p0, z25.s
+ fdiv s25, s29, s25
+ dup z25.s, z25.s[0] // z25: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax.
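+
+    // Worked example (illustrative): if the exponentials of a row sum to 32.f, then
+    // z25 = 256.f / 32.f = 8.f, and an element whose tmp value is 0.5f becomes
+    // 0.5f * 8.f = 4.f, which is truncated and saturated to the uint8 value 4.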
+
+ // ==================================================
+ // Step 3: Normalize
+ // ==================================================
+ mov x1, #0
+normalize_body_start%=:
+ cmp x1, x13
+ b.eq normalize_body_end%=
+
+ mov x2, x1 // Preserve the index into x2 for the final store to dst.
+ .inst 0xa001c7b0 // ld1w { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+ .inst 0xa001c7b4 // ld1w { z20.s - z23.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+
+ // z16-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+ fmul z16.s, z25.s, z16.s
+ fmul z17.s, z25.s, z17.s
+ fmul z18.s, z25.s, z18.s
+ fmul z19.s, z25.s, z19.s
+ fmul z20.s, z25.s, z20.s
+ fmul z21.s, z25.s, z21.s
+ fmul z22.s, z25.s, z22.s
+ fmul z23.s, z25.s, z23.s
+
+ // z16-z23: convert the FP32 values from the tmp tensor to uint32.
+ fcvtzu z16.s, p0/m, z16.s
+ fcvtzu z17.s, p0/m, z17.s
+ fcvtzu z18.s, p0/m, z18.s
+ fcvtzu z19.s, p0/m, z19.s
+ fcvtzu z20.s, p0/m, z20.s
+ fcvtzu z21.s, p0/m, z21.s
+ fcvtzu z22.s, p0/m, z22.s
+ fcvtzu z23.s, p0/m, z23.s
+
+ // z16-z17: narrow the uint32 values into uint8 and saturate them.
+ .inst 0xc133e230 // uqcvt z16.b, { z16.s - z19.s }
+ .inst 0xc133e2b1 // uqcvt z17.b, { z20.s - z23.s }
+
+ dup z20.s, z25.s[0] // Juggling the value to z20 as z25 will be overwritten by the load below
+
+ .inst 0xa001c7b8 // ld1w { z24.s - z27.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+ .inst 0xa001c7bc // ld1w { z28.s - z31.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+
+ // z24-z31: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+ fmul z24.s, z20.s, z24.s
+ fmul z25.s, z20.s, z25.s
+ fmul z26.s, z20.s, z26.s
+ fmul z27.s, z20.s, z27.s
+ fmul z28.s, z20.s, z28.s
+ fmul z29.s, z20.s, z29.s
+ fmul z30.s, z20.s, z30.s
+ fmul z31.s, z20.s, z31.s
+
+ // z24-z31: convert the FP32 values from the tmp tensor to uint32.
+ fcvtzu z24.s, p0/m, z24.s
+ fcvtzu z25.s, p0/m, z25.s
+ fcvtzu z26.s, p0/m, z26.s
+ fcvtzu z27.s, p0/m, z27.s
+ fcvtzu z28.s, p0/m, z28.s
+ fcvtzu z29.s, p0/m, z29.s
+ fcvtzu z30.s, p0/m, z30.s
+ fcvtzu z31.s, p0/m, z31.s
+
+ // z18-z19: narrow the uint32 values into uint8 and saturate them.
+ .inst 0xc133e332 // uqcvt z18.b, { z24.s - z27.s }
+ .inst 0xc133e3b3 // uqcvt z19.b, { z28.s - z31.s }
+
+ .inst 0xa0228390 // st1b { z16.b - z19.b }, pn8, [x28, x2]
+
+ dup z25.s, z20.s[0] // Juggling the value back to z25 as z20 will be overwritten by the next iteration or z25 will be used below.
+
+b normalize_body_start%=
+normalize_body_end%=:
+
+normalize_leftover_start%=:
+ whilelo p1.b, x1, %x[length]
+ b.none normalize_leftover_end%=
+
+    // p2-p5 together form the 32-bit expansion of p1; the instructions below unpack p1 into these four predicate registers so that the 32-bit loads/stores below are correctly predicated.
+ punpklo p2.h, p1.b
+ punpkhi p4.h, p1.b
+
+ punpkhi p3.h, p2.b
+ punpklo p2.h, p2.b
+
+ punpkhi p5.h, p4.b
+ punpklo p4.h, p4.b
+
+ mov x2, x1 // Preserve the index into x2 for the final store to dst.
+
+ // z20-z23: load exp(-scale*beta*x) from the tmp tensor
+ ld1w z20.s, p2/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ ld1w z21.s, p3/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ ld1w z22.s, p4/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ ld1w z23.s, p5/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ // z20-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+ fmul z20.s, z25.s, z20.s
+ fmul z21.s, z25.s, z21.s
+ fmul z22.s, z25.s, z22.s
+ fmul z23.s, z25.s, z23.s
+
+ // z20-23: convert the FP32 values from the tmp tensor to uint32.
+ fcvtzu z20.s, p0/m, z20.s
+ fcvtzu z21.s, p0/m, z21.s
+ fcvtzu z22.s, p0/m, z22.s
+ fcvtzu z23.s, p0/m, z23.s
+
+ .inst 0xc133e2b3 // uqcvt z19.b, { z20.s - z23.s }, narrow the uint32 values into uint8 and saturate them into z19.
+
+ st1b z19.b, p1, [x28, x2]
+
+ b normalize_leftover_start%=
+normalize_leftover_end%=:
+ // ==================================================
+ // 3D loop closing
+ // ==================================================
+ add x27, x27, %x[src_stride_1]
+ add x28, x28, %x[dst_stride_1]
+ b loop_1_start%=
+loop_1_end%=:
+
+ add x24, x24, %x[src_stride_2]
+ add x25, x25, %x[dst_stride_2]
+ b loop_2_start%=
+loop_2_end%=:
+
+ add x21, x21, %x[src_stride_3]
+ add x22, x22, %x[dst_stride_3]
+ b loop_3_start%=
+loop_3_end%=:
+ .inst 0xd503467f // smstop
+ )"
+ :
+ : [src] "r"(src), [tmp] "r"(tmp), [dst] "r"(dst), [beta] "r"(beta), [lut] "r"(lut), //
+ [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), //
+ [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]),
+ [src_stride_3] "r"(src_strides[3]), //
+ [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]),
+ [dst_stride_3] "r"(dst_strides[3]), //
+ [length] "r"(shape[0]) //
+ : "cc", "memory", //
+          "p0", "p1", "p2", "p3", "p4", "p5", "p8", "p9", //
+          "x1", "x2", "x9", "x13", //
+ "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x19", //
+ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", //
+ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", //
+ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", //
+ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" //
+ );
+}
+
+void sme2_qasymm8_softmax_lut_512VL(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(axis);
+
+ const auto *src_info = in->info();
+ const auto *dst_info = out->info();
+
+ const auto &full_shape = dst_info->tensor_shape();
+ const auto &src_strides = src_info->strides_in_bytes();
+ const auto &dst_strides = dst_info->strides_in_bytes();
+ Strides tmp_strides;
+
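+    // The scratch tensor holds one FP32 value per input byte, hence 4x the source strides.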
+ tmp_strides[0] = src_strides[0] * 4;
+ tmp_strides[1] = src_strides[1] * 4;
+ tmp_strides[2] = src_strides[2] * 4;
+ tmp_strides[3] = src_strides[3] * 4;
+
+ const uintptr_t k_shape[] = {
+ full_shape[0],
+ window.num_iterations(1),
+ window.num_iterations(2),
+ window.num_iterations(3),
+ };
+
+ const uintptr_t k_src_strides[] = {
+ src_strides[0],
+ src_strides[1],
+ src_strides[2],
+ src_strides[3],
+ };
+
+ const uintptr_t k_dst_strides[] = {
+ dst_strides[0],
+ dst_strides[1],
+ dst_strides[2],
+ dst_strides[3],
+ };
+
+ const uintptr_t k_src_offset = window[0].start() * src_strides[0] + //
+ window[1].start() * src_strides[1] + //
+ window[2].start() * src_strides[2] + //
+ window[3].start() * src_strides[3];
+
+ const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + //
+ window[1].start() * dst_strides[1] + //
+ window[2].start() * dst_strides[2] + //
+ window[3].start() * dst_strides[3];
+
+ const uintptr_t k_tmp_offset = window[0].start() * tmp_strides[0] + //
+ window[1].start() * tmp_strides[1] + //
+ window[2].start() * tmp_strides[2] + //
+ window[3].start() * tmp_strides[3];
+
+ const auto *k_src = reinterpret_cast<const uint8_t *>(in->buffer() + k_src_offset);
+ float *tmp_float_ptr = reinterpret_cast<float *>(tmp);
+ auto *k_tmp = reinterpret_cast<float *>(tmp_float_ptr + k_tmp_offset);
+ auto *k_dst = reinterpret_cast<uint8_t *>(out->buffer() + k_dst_offset);
+
+ sme2_qasymm8_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp
new file mode 100644
index 0000000000..14c0f6c327
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+// SoftMax
+//
+// Steps:
+// * Find max: max_value = max(src)
+// * Regularize: dst[i] = exp(src[i] - max_value)
+// sum_value = sum(dst)
+// * Normalize: dst[i] = dst[i] / sum_value
+void sme2_qasymm8_signed_softmax_kernel_512VL( //
+ const int8_t *src,
+ int8_t *dst,
+ float beta,
+ const uintptr_t shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4],
+ const float *lut,
+ float *tmp)
+{
+ // Precondition:
+ // * src_strides[0] == sizeof(int8_t)
+ // * dst_strides[0] == sizeof(int8_t)
+ // * tmp_strides[0] == sizeof(float)
+
+ __asm__ volatile(
+ R"(
+ .inst 0xd503477f // smstart
+
+ // For register list explanation refer to qasymm8.cpp.
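+    //
+    // Differences from the QASYMM8 kernel: signed max/unpack instructions (smax,
+    // sunpklo/sunpkhi) are used, the LUT index (max_value - x) is biased by +128 before
+    // the lookups, and 128.f is subtracted before the final conversion so that the
+    // scaled results land in the QASYMM8_SIGNED range [-128, 127].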
+
+ // Prepares all constant values
+
+ ptrue p0.b
+ .inst 0x25a07811 // ptrue pn9.s
+ .inst 0x25207810 // ptrue pn8.b
+
+ // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl
+ cntb x13, ALL, MUL #4
+ udiv x9, %x[length], x13
+ mul x13, x13, x9
+
+ // ==================================================
+ // 3D loop opening
+ // ==================================================
+
+ mov x20, %x[shape_3]
+ mov x21, %x[src]
+ mov x22, %x[dst]
+ mov x19, %x[lut]
+ mov x29, %x[tmp]
+
+ // Load the LUT to the register file.
+ mov x2, %x[lut]
+ .inst 0xa040c440 //ld1w { z0.s - z3.s }, pn9/z, [x2]
+ add x2, x2, #256
+ .inst 0xa040c444 //ld1w { z4.s - z7.s }, pn9/z, [x2]
+ add x2, x2, #256
+ .inst 0xa040c448 //ld1w { z8.s - z11.s }, pn9/z, [x2]
+ add x2, x2, #256
+ .inst 0xa040c44c //ld1w { z12.s - z15.s }, pn9/z, [x2]
+
+
+loop_3_start%=:
+ // for index_3 in shape_3 downto 1
+ cmp x20, #0
+ b.eq loop_3_end%=
+ sub x20, x20, #1
+
+ mov x23, %x[shape_2]
+ mov x24, x21
+ mov x25, x22
+
+loop_2_start%=:
+ // for index_2 in shape_2 downto 1
+ cmp x23, #0
+ b.eq loop_2_end%=
+ sub x23, x23, #1
+
+ mov x26, %x[shape_1]
+ mov x27, x24
+ mov x28, x25
+
+loop_1_start%=:
+    // for index_1 in shape_1 downto 1
+ cmp x26, #0
+ b.eq loop_1_end%=
+ sub x26, x26, #1
+
+ // ==================================================
+ // Step 1: Find max
+ // ==================================================
+    // z16-z19: initialized to the minimum QASYMM8_SIGNED value (-128) so it can be used as the starting point for the max search.
+ dup z16.b, #0x80
+ dup z17.b, #0x80
+ dup z18.b, #0x80
+ dup z19.b, #0x80
+
+ mov x1, #0 // x1: index
+find_max_body_start%=:
+ cmp x1, x13
+ b.eq find_max_body_end%=
+    .inst 0xa0018374 // ld1b { z20.b - z23.b }, pn8/z, [x27, x1] z20-z23: x
+ .inst 0xc134b810 // smax { z16.b - z19.b }, { z16.b - z19.b }, { z20.b - z23.b } z16-z19: max_value = max(max_value, x)
+    add x1, x1, #256 // Advance the index by 256 elements: four Z registers at 512-bit VL hold 2048 bits = 256 8-bit integers.
+ b find_max_body_start%=
+find_max_body_end%=:
+
+ // Loop for processing the leftover part.
+find_max_leftover_start%=:
+ whilelo p1.b, x1, %x[length]
+ b.none find_max_leftover_end%=
+
+ ld1b z30.b, p1/z, [x27, x1] // z30: x
+ smax z16.b, p1/m, z16.b, z30.b // z16: max_value = max(max_value, x)
+
+ add x1, x1, #64
+
+ b find_max_leftover_start%=
+find_max_leftover_end%=:
+ .inst 0xc132b010 // smax { z16.b, z17.b }, { z16.b, z17.b }, { z18.b, z19.b }
+ smax z16.b, p0/m, z16.b, z17.b
+ smaxv b16, p0, z16.b // Reduction signed max operation to get maximum_value
+ mov z16.b, b16 // z16: duplicated max_value for current row
+
+ sunpklo z16.h, z16.b // Using unpack instructions to align the max value with the FP32 entries in the LUT for use in the TBX instruction
+ sunpklo z16.s, z16.h
+
+ mov x1, #0 // reset index
+ dup z25.s, #0
+
+
+regularize_start%=:
+ whilelo p1.b, x1, %x[length]
+ b.none regularize_end%=
+
+ mov w9, 0xFF80
+    movk w9, 0xFFFF, LSL #16 // Moving -128 (0xFFFFFF80) into w9 to set the registers below to the minimum QASYMM8_SIGNED value
+ dup z17.s, w9
+ dup z18.s, w9
+ dup z19.s, w9
+ dup z20.s, w9
+
+ dup z21.s, #0x0
+ dup z22.s, #0x0
+ dup z23.s, #0x0
+ dup z24.s, #0x0
+
+    // p2-p5 together form the 32-bit expansion of p1; the instructions below unpack p1 into these four predicate registers so that the 32-bit loads/stores below are correctly predicated.
+ punpklo p2.h, p1.b
+ punpkhi p4.h, p1.b
+
+ punpkhi p3.h, p2.b
+ punpklo p2.h, p2.b
+
+ punpkhi p5.h, p4.b
+ punpklo p4.h, p4.b
+
+ ld1b z17.b, p1/z, [x27, x1] //z17: input data
+
+ sunpklo z18.h, z17.b // Using unpack instructions to align the input QASYMM8_SIGNED values with the FP32 entries in the LUT for use in the TBX instruction
+ sunpkhi z19.h, z17.b //
+
+ sunpklo z17.s, z18.h // z17 = low low input QASYMM8_SIGNED values
+ sunpkhi z18.s, z18.h // z18 = low high input QASYMM8_SIGNED values
+
+ sunpkhi z20.s, z19.h // z20 = high high input QASYMM8_SIGNED values
+ sunpklo z19.s, z19.h // z19 = high low input QASYMM8_SIGNED values
+
+    sub z17.s, z16.s, z17.s // z17: x = max_value - input_data
+    sub z18.s, z16.s, z18.s // z18: x = max_value - input_data
+    sub z19.s, z16.s, z19.s // z19: x = max_value - input_data
+    sub z20.s, z16.s, z20.s // z20: x = max_value - input_data
+
+ add z17.s, z17.s, #128
+ add z18.s, z18.s, #128
+ add z19.s, z19.s, #128
+ add z20.s, z20.s, #128
+
+ tbx z21.s, z0.s, z17.s // Look-up entries 0-15 in the LUT.
+ tbx z22.s, z0.s, z18.s
+ tbx z23.s, z0.s, z19.s
+ tbx z24.s, z0.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z1.s, z17.s // Look-up entries 16-31 in the LUT.
+ tbx z22.s, z1.s, z18.s
+ tbx z23.s, z1.s, z19.s
+ tbx z24.s, z1.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z2.s, z17.s // Look-up entries 32-47 in the LUT.
+ tbx z22.s, z2.s, z18.s
+ tbx z23.s, z2.s, z19.s
+ tbx z24.s, z2.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z3.s, z17.s // Look-up entries 48-63 in the LUT.
+ tbx z22.s, z3.s, z18.s
+ tbx z23.s, z3.s, z19.s
+ tbx z24.s, z3.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z4.s, z17.s // Look-up entries 64-79 in the LUT.
+ tbx z22.s, z4.s, z18.s
+ tbx z23.s, z4.s, z19.s
+ tbx z24.s, z4.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z5.s, z17.s // Look-up entries 80-95 in the LUT.
+ tbx z22.s, z5.s, z18.s
+ tbx z23.s, z5.s, z19.s
+ tbx z24.s, z5.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z6.s, z17.s // Look-up entries 96-111 in the LUT.
+ tbx z22.s, z6.s, z18.s
+ tbx z23.s, z6.s, z19.s
+ tbx z24.s, z6.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z7.s, z17.s // Look-up entries 112-127 in the LUT.
+ tbx z22.s, z7.s, z18.s
+ tbx z23.s, z7.s, z19.s
+ tbx z24.s, z7.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z8.s, z17.s // Look-up entries 128-143 in the LUT.
+ tbx z22.s, z8.s, z18.s
+ tbx z23.s, z8.s, z19.s
+ tbx z24.s, z8.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z9.s, z17.s // Look-up entries 144-159 in the LUT.
+ tbx z22.s, z9.s, z18.s
+ tbx z23.s, z9.s, z19.s
+ tbx z24.s, z9.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z10.s, z17.s // Look-up entries 160-175 in the LUT.
+ tbx z22.s, z10.s, z18.s
+ tbx z23.s, z10.s, z19.s
+ tbx z24.s, z10.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z11.s, z17.s // Look-up entries 176-191 in the LUT.
+ tbx z22.s, z11.s, z18.s
+ tbx z23.s, z11.s, z19.s
+ tbx z24.s, z11.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z12.s, z17.s // Look-up entries 192-207 in the LUT.
+ tbx z22.s, z12.s, z18.s
+ tbx z23.s, z12.s, z19.s
+ tbx z24.s, z12.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z13.s, z17.s // Look-up entries 208-223 in the LUT.
+ tbx z22.s, z13.s, z18.s
+ tbx z23.s, z13.s, z19.s
+ tbx z24.s, z13.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z14.s, z17.s // Look-up entries 224-239 in the LUT.
+ tbx z22.s, z14.s, z18.s
+ tbx z23.s, z14.s, z19.s
+ tbx z24.s, z14.s, z20.s
+
+ sub z17.s, z17.s, #16
+ sub z18.s, z18.s, #16
+ sub z19.s, z19.s, #16
+ sub z20.s, z20.s, #16
+
+ tbx z21.s, z15.s, z17.s // Look-up entries 240-255 in the LUT.
+ tbx z22.s, z15.s, z18.s
+ tbx z23.s, z15.s, z19.s
+ tbx z24.s, z15.s, z20.s
+
+
+ st1w z21.s, p2, [x29, x1, LSL #2]// z21 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p2/m, z25.s, z21.s
+ add x1, x1, #16
+
+ st1w z22.s, p3, [x29, x1, LSL #2]// z22 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p3/m, z25.s, z22.s
+ add x1, x1, #16
+
+ st1w z23.s, p4, [x29, x1, LSL #2]// z23 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p4/m, z25.s, z23.s
+ add x1, x1, #16
+
+ st1w z24.s, p5, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor
+ fadd z25.s, p5/m, z25.s, z24.s
+ add x1, x1, #16
+
+ b regularize_start%=
+regularize_end%=:
+
+ mov w9, 0x0000
+    movk w9, 0x4380, LSL #16 // Moving 256.f into w9 to scale - via multiplication (division by reciprocal) - the floating point [0,1] range of the results to [0,255]; the 128.f subtraction below then shifts this into the [-128,127] range of QASYMM8_SIGNED
+ mov w10, 0x0000
+ movk w10, 0x4300, LSL #16 // Moving 128.f into w10 for the subtraction to move the results - via subtraction - from the [0,255] range to the [-128,127] range
+ dup z29.s, w9
+ dup z30.s, w10
+ faddv s25, p0, z25.s
+ fdiv s25, s29, s25
+ dup z25.s, z25.s[0] // z25: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax.
+
+ // ==================================================
+ // Step 3: Normalize
+ // ==================================================
+ mov x1, #0
+normalize_body_start%=:
+ cmp x1, x13
+ b.eq normalize_body_end%=
+
+ mov x2, x1 // Preserve the index into x2 for the final store to dst.
+ .inst 0xa001c7b0 // ld1w { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+ .inst 0xa001c7b4 // ld1w { z20.s - z23.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+
+ // z16-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+ fmul z16.s, z25.s, z16.s
+ fmul z17.s, z25.s, z17.s
+ fmul z18.s, z25.s, z18.s
+ fmul z19.s, z25.s, z19.s
+ fmul z20.s, z25.s, z20.s
+ fmul z21.s, z25.s, z21.s
+ fmul z22.s, z25.s, z22.s
+ fmul z23.s, z25.s, z23.s
+
+ // z16-z23: subtract 128.f.
+ fsub z16.s, z16.s, z30.s // Subtract 128.f
+ fsub z17.s, z17.s, z30.s // Subtract 128.f
+ fsub z18.s, z18.s, z30.s // Subtract 128.f
+ fsub z19.s, z19.s, z30.s // Subtract 128.f
+ fsub z20.s, z20.s, z30.s // Subtract 128.f
+ fsub z21.s, z21.s, z30.s // Subtract 128.f
+ fsub z22.s, z22.s, z30.s // Subtract 128.f
+ fsub z23.s, z23.s, z30.s // Subtract 128.f
+
+ // z16-z23: convert the FP32 values from the tmp tensor to int32.
+ fcvtzs z16.s, p0/m, z16.s
+ fcvtzs z17.s, p0/m, z17.s
+ fcvtzs z18.s, p0/m, z18.s
+ fcvtzs z19.s, p0/m, z19.s
+ fcvtzs z20.s, p0/m, z20.s
+ fcvtzs z21.s, p0/m, z21.s
+ fcvtzs z22.s, p0/m, z22.s
+ fcvtzs z23.s, p0/m, z23.s
+
+ // z16-z17: narrow the int32 values into int8 and saturate them.
+ .inst 0xc133e210 // sqcvt z16.b, { z16.s - z19.s }
+ .inst 0xc133e291 // sqcvt z17.b, { z20.s - z23.s }
+
+ // Juggling the value to z20 (resp. 21) as z25 (resp. z30) will be overwritten by the load below.
+ dup z20.s, z25.s[0]
+ dup z21.s, z30.s[0]
+
+ .inst 0xa001c7b8 // ld1w { z24.s - z27.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+ .inst 0xa001c7bc // ld1w { z28.s - z31.s }, pn9/z, [x29, x1, lsl #2]
+ add x1, x1, #64
+
+ // z24-z31: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+ fmul z24.s, z20.s, z24.s
+ fmul z25.s, z20.s, z25.s
+ fmul z26.s, z20.s, z26.s
+ fmul z27.s, z20.s, z27.s
+ fmul z28.s, z20.s, z28.s
+ fmul z29.s, z20.s, z29.s
+ fmul z30.s, z20.s, z30.s
+ fmul z31.s, z20.s, z31.s
+
+ // z24-z31: subtract 128.f.
+ fsub z24.s, z24.s, z21.s
+ fsub z25.s, z25.s, z21.s
+ fsub z26.s, z26.s, z21.s
+ fsub z27.s, z27.s, z21.s
+ fsub z28.s, z28.s, z21.s
+ fsub z29.s, z29.s, z21.s
+ fsub z30.s, z30.s, z21.s
+ fsub z31.s, z31.s, z21.s
+
+ // z24-z31: convert the FP32 values from the tmp tensor to int32.
+ fcvtzs z24.s, p0/m, z24.s
+ fcvtzs z25.s, p0/m, z25.s
+ fcvtzs z26.s, p0/m, z26.s
+ fcvtzs z27.s, p0/m, z27.s
+ fcvtzs z28.s, p0/m, z28.s
+ fcvtzs z29.s, p0/m, z29.s
+ fcvtzs z30.s, p0/m, z30.s
+ fcvtzs z31.s, p0/m, z31.s
+
+ // z18-z19: narrow the int32 values into int8 and saturate them.
+ .inst 0xc133e312 // sqcvt z18.b, { z24.s - z27.s }
+ .inst 0xc133e393 // sqcvt z19.b, { z28.s - z31.s }
+
+ .inst 0xa0228390 // st1b { z16.b - z19.b }, pn8, [x28, x2]
+
+ // Juggling the values back to z25 (resp. z30) as z20 (resp. z21) will be overwritten by the next iteration or z25 (resp. z30) will be used below.
+ dup z25.s, z20.s[0]
+ dup z30.s, z21.s[0]
+b normalize_body_start%=
+normalize_body_end%=:
+normalize_leftover_start%=:
+ whilelo p1.b, x1, %x[length]
+ b.none normalize_leftover_end%=
+
+    // p2-p5 together form the 32-bit expansion of p1; the instructions below unpack p1 into these four predicate registers so that the 32-bit loads/stores below are correctly predicated.
+ punpklo p2.h, p1.b
+ punpkhi p4.h, p1.b
+
+ punpkhi p3.h, p2.b
+ punpklo p2.h, p2.b
+
+ punpkhi p5.h, p4.b
+ punpklo p4.h, p4.b
+
+ mov x2, x1 // Preserve the index into x2 for the final store to dst.
+
+ // z20-z23: load exp(-scale*beta*x) from the tmp tensor
+ ld1w z20.s, p2/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ ld1w z21.s, p3/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ ld1w z22.s, p4/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ ld1w z23.s, p5/z, [x29, x1, LSL #2]
+ add x1, x1, #16
+
+ // z20-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+ fmul z20.s, z25.s, z20.s
+ fmul z21.s, z25.s, z21.s
+ fmul z22.s, z25.s, z22.s
+ fmul z23.s, z25.s, z23.s
+
+ //z20-z23: Subtract 128.f.
+ fsub z20.s, z20.s, z30.s
+ fsub z21.s, z21.s, z30.s
+ fsub z22.s, z22.s, z30.s
+ fsub z23.s, z23.s, z30.s
+
+ // z20-23: convert the FP32 values from the tmp tensor to int32.
+ fcvtzs z20.s, p0/m, z20.s
+ fcvtzs z21.s, p0/m, z21.s
+ fcvtzs z22.s, p0/m, z22.s
+ fcvtzs z23.s, p0/m, z23.s
+
+ .inst 0xc133e293 // sqcvt z19.b, { z20.s - z23.s }, narrow the int32 values into int8 and saturate them into z19.
+
+ st1b z19.b, p1, [x28, x2]
+
+ b normalize_leftover_start%=
+normalize_leftover_end%=:
+ // ==================================================
+ // 3D loop closing
+ // ==================================================
+ add x27, x27, %x[src_stride_1]
+ add x28, x28, %x[dst_stride_1]
+ b loop_1_start%=
+loop_1_end%=:
+
+ add x24, x24, %x[src_stride_2]
+ add x25, x25, %x[dst_stride_2]
+ b loop_2_start%=
+loop_2_end%=:
+
+ add x21, x21, %x[src_stride_3]
+ add x22, x22, %x[dst_stride_3]
+ b loop_3_start%=
+loop_3_end%=:
+ .inst 0xd503467f // smstop
+ )"
+ :
+ : [src] "r"(src), [tmp] "r"(tmp), [dst] "r"(dst), [beta] "r"(beta), [lut] "r"(lut), //
+ [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), //
+ [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]),
+ [src_stride_3] "r"(src_strides[3]), //
+ [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]),
+ [dst_stride_3] "r"(dst_strides[3]), //
+ [length] "r"(shape[0]) //
+ : "cc", "memory", //
+          "p0", "p1", "p2", "p3", "p4", "p5", "p8", "p9", //
+          "x1", "x2", "x9", "x10", "x13", //
+ "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x19", //
+ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", //
+ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", //
+ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", //
+ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" //
+ );
+}
+
+void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr)
+{
+ ARM_COMPUTE_UNUSED(axis);
+
+ const auto *src_info = in->info();
+ const auto *dst_info = out->info();
+
+ const auto &full_shape = dst_info->tensor_shape();
+ const auto &src_strides = src_info->strides_in_bytes();
+ const auto &dst_strides = dst_info->strides_in_bytes();
+ Strides tmp_strides;
+
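+ // The tmp tensor stores one fp32 value per int8 source element, so its byte strides are four times the source strides.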
+ tmp_strides[0] = src_strides[0] * 4;
+ tmp_strides[1] = src_strides[1] * 4;
+ tmp_strides[2] = src_strides[2] * 4;
+ tmp_strides[3] = src_strides[3] * 4;
+
+ const uintptr_t k_shape[] = {
+ full_shape[0],
+ window.num_iterations(1),
+ window.num_iterations(2),
+ window.num_iterations(3),
+ };
+
+ const uintptr_t k_src_strides[] = {
+ src_strides[0],
+ src_strides[1],
+ src_strides[2],
+ src_strides[3],
+ };
+
+ const uintptr_t k_dst_strides[] = {
+ dst_strides[0],
+ dst_strides[1],
+ dst_strides[2],
+ dst_strides[3],
+ };
+
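+ // Byte offsets of the first element processed by this call, derived from the execution window start in each dimension.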
+ const uintptr_t k_src_offset = window[0].start() * src_strides[0] + //
+ window[1].start() * src_strides[1] + //
+ window[2].start() * src_strides[2] + //
+ window[3].start() * src_strides[3];
+
+ const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + //
+ window[1].start() * dst_strides[1] + //
+ window[2].start() * dst_strides[2] + //
+ window[3].start() * dst_strides[3];
+
+ const uintptr_t k_tmp_offset = window[0].start() * tmp_strides[0] + //
+ window[1].start() * tmp_strides[1] + //
+ window[2].start() * tmp_strides[2] + //
+ window[3].start() * tmp_strides[3];
+
+ const auto *k_src = reinterpret_cast<const int8_t *>(in->buffer() + k_src_offset);
+ float *tmp_float_ptr = reinterpret_cast<float *>(tmp);
+ auto *k_tmp = reinterpret_cast<float *>(tmp_float_ptr + k_tmp_offset);
+ auto *k_dst = reinterpret_cast<int8_t *>(out->buffer() + k_dst_offset);
+
+ sme2_qasymm8_signed_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
new file mode 100644
index 0000000000..0d4b7f4509
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation should be converted to
+/// a single kernel that performs the softmax operation. The SVE code is kept here for
+/// future reference. The Neon(TM) implementation is introduced in COMPMID-6500.
+template <typename ScalarType>
+void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
+{
+ const auto all_true_pg = wrapper::svptrue<ScalarType>();
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ // Init max value
+ auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
+
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto current_value = svld1(pg, in_ptr + x);
+ vec_max = svmax_m(pg, vec_max, current_value);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+
+ auto max_val = svmaxv(all_true_pg, vec_max);
+
+ *out_ptr = max_val;
+ },
+ input, output);
+}
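+
+// Editorial sketch (illustration only, hypothetical helper): the same whilelt-predicated max
+// reduction written with raw SVE ACLE intrinsics for a single fp32 row, to make the wrapper::
+// calls above concrete.
+static inline float sketch_row_max_fp32(const float *ptr, int len)
+{
+    const svbool_t all_true = svptrue_b32();
+    svfloat32_t    vec_max  = svdup_n_f32(support::cpp11::lowest<float>());
+    int            x        = 0;
+    svbool_t       pg       = svwhilelt_b32(x, len);
+    do
+    {
+        vec_max = svmax_m(pg, vec_max, svld1(pg, ptr + x)); // inactive lanes keep their previous value
+        x += static_cast<int>(svcntw());
+        pg = svwhilelt_b32(x, len);
+    } while (svptest_any(all_true, pg));
+    return svmaxv(all_true, vec_max); // horizontal max across all lanes
+}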
+
+template <typename ScalarType>
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
+{
+ const int start_x = in->info()->valid_region().anchor.x();
+ const int input_width = in->info()->valid_region().shape.x();
+
+ Iterator in_it(in, window);
+ Iterator max_it(max, window);
+ Iterator out_it(out, window);
+
+ const auto all_true_pg = wrapper::svptrue<ScalarType>();
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
+
+ ScalarType sum{0};
+
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+ const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+
+ /* Init sum to zero */
+ auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
+ {
+ auto vec_elements = svld1(pg, in_ptr + x);
+ vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
+ if (!is_log)
+ {
+ vec_elements = wrapper::svexp_z(pg, vec_elements);
+ vec_sum = svadd_m(pg, vec_sum, vec_elements);
+ }
+ svst1(pg, tmp_ptr + x, vec_elements);
+
+ if (is_log)
+ {
+ vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+ }
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ sum = svaddv(all_true_pg, vec_sum);
+
+ if (is_log)
+ {
+ sum = static_cast<ScalarType>(std::log(sum));
+ }
+ else
+ {
+ sum = ScalarType(1) / sum;
+ }
+ }
+
+ /* Normalize exponentials */
+ {
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
+ {
+ auto vec_in = svld1(pg, tmp_ptr + x);
+ auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
+ if (is_log)
+ {
+ normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ else
+ {
+ normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ svst1(pg, out_ptr + x, normalized_value);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
+ }
+ },
+ in_it, max_it, out_it);
+}
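+
+// Editorial sketch (illustration only, hypothetical helper): scalar reference of the row-wise
+// computation above for `len` fp32 elements whose maximum is `max_val`; `out` plays the role of
+// the tmp buffer used by the vector code. Relies on std::exp/std::log, as the code above already
+// does for std::log.
+static inline void sketch_softmax_row_fp32(const float *in, float *out, int len, float max_val, float beta, bool is_log)
+{
+    float sum = 0.f;
+    for (int i = 0; i < len; ++i)
+    {
+        const float shifted = (in[i] - max_val) * beta;
+        out[i]              = is_log ? shifted : std::exp(shifted); // value stored in tmp
+        sum += std::exp(shifted);
+    }
+    for (int i = 0; i < len; ++i)
+    {
+        out[i] = is_log ? (out[i] - std::log(sum)) : (out[i] / sum);
+    }
+}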
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h
new file mode 100644
index 0000000000..89a30d042f
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/impl.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H
+#define SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
+
+template <typename ScalarType>
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
new file mode 100644
index 0000000000..a8fb1d4adf
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation should be converted to
+/// a single kernel that performs the softmax operation. The SVE2 code is kept here for
+/// future reference. The Neon(TM) implementation is introduced in COMPMID-6500.
+template <typename ScalarType>
+void sve2_softmax_logits_1d_quantized(
+ const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
+{
+ const int start_x = in->info()->valid_region().anchor.x();
+ const int input_width = in->info()->valid_region().shape.x();
+
+ const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
+ const auto scale_beta_vec = svdup_n_f32(scale_beta);
+
+ Iterator in_it(in, window);
+ Iterator max_it(max, window);
+ Iterator out_it(out, window);
+ const auto all_true_pg = wrapper::svptrue<ScalarType>();
+ using SVEType = typename wrapper::traits::sve_vector<ScalarType>::type;
+
+ const int inc_1 = static_cast<int>(svcntw());
+ const int inc_2 = static_cast<int>(2 * svcntw());
+ const int inc_3 = static_cast<int>(3 * svcntw());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+
+ float sum{};
+
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+
+ /* Init sum to zero */
+ auto vec_sum_0 = svdup_n_f32(0.f);
+ auto vec_sum_1 = svdup_n_f32(0.f);
+ auto vec_sum_2 = svdup_n_f32(0.f);
+ auto vec_sum_3 = svdup_n_f32(0.f);
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
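+ // pg_0..pg_3: widen the 8-bit element predicate pg into four 32-bit element predicates, one per fp32 quarter of the byte vector.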
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
+ {
+ const auto vec_elements = svld1(pg, in_ptr + x);
+ const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+
+ auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
+ auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
+
+ if (is_log)
+ {
+ vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
+ vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
+ vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
+ vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
+ }
+ else
+ {
+ vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
+ vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
+ vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
+ vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+ }
+
+ svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
+ svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
+ svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
+ svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1),
+ svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
+ sum = svaddv_f32(all_true_pg, vec_sum);
+
+ /* Run remaining elements */
+ x = 0;
+ if (is_log)
+ {
+ sum = std::log(sum);
+ }
+ else
+ {
+ sum = 256.f / sum;
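+ // 256/sum maps the probabilities onto the fixed 1/256 output quantization scale before conversion to the output type.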
+ }
+ }
+
+ /* Normalize exponentials */
+ {
+ constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
+ {
+ auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
+ auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
+ auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
+ auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
+
+ svfloat32_t res_0{};
+ svfloat32_t res_1{};
+ svfloat32_t res_2{};
+ svfloat32_t res_3{};
+
+ if (is_log)
+ {
+ res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+ }
+ else
+ {
+ res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = svdup_n_f32(128.f);
+ res_0 = svsub_z(pg_0, res_0, offset_vec);
+ res_1 = svsub_z(pg_1, res_1, offset_vec);
+ res_2 = svsub_z(pg_2, res_2, offset_vec);
+ res_3 = svsub_z(pg_3, res_3, offset_vec);
+ }
+ }
+
+ // Store value
+ const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
+ svst1(pg, out_ptr + x, out);
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
+ }
+ },
+ in_it, max_it, out_it);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h
new file mode 100644
index 0000000000..33fcc26cda
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H
+#define SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void sve2_softmax_logits_1d_quantized(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
new file mode 100644
index 0000000000..7bbb265022
--- /dev/null
+++ b/src/cpu/kernels/softmax/list.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_SOFTMAX_KERNEL(func_name) \
+ template <bool IS_LOG> \
+ void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, \
+ const float *lut_ptr)
+
+DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
+DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
+DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax);
+DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax);
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+void sme2_fp32_softmax(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+void sme2_fp16_softmax(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+void sme2_qasymm8_softmax_lut_512VL(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ int axis,
+ const Window &window,
+ const float *lut_ptr);
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+
+#undef DECLARE_SOFTMAX_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
diff --git a/src/cpu/kernels/sub/neon/fp16.cpp b/src/cpu/kernels/sub/neon/fp16.cpp
new file mode 100644
index 0000000000..023068817b
--- /dev/null
+++ b/src/cpu/kernels/sub/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/kernels/sub/neon/impl.h"
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sub_same_neon_fp16(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ sub_same_neon<float16_t>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/sub/neon/impl.h b/src/cpu/kernels/sub/neon/impl.h
new file mode 100644
index 0000000000..6123f7e25a
--- /dev/null
+++ b/src/cpu/kernels/sub/neon/impl.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/NEON/wrapper/scalar/sub.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void sub_same_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ bool is_sat = policy == ConvertPolicy::SATURATE;
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+
+ Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
+ Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
+ Iterator output(dst, window);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v)
+ : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
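+ // res holds broadcast - non_broadcast; when src1 is the broadcast operand the required result is src0 - src1, so negate it.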
+ if (is_broadcast_input_2)
+ {
+ res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
+ }
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ auto res =
+ is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
+ if (is_broadcast_input_2)
+ {
+ res = static_cast<T>(-1) * res;
+ }
+
+ *(output_ptr + x) = res;
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto val1 = wrapper::vloadq(input1_ptr + x);
+ const auto val2 = wrapper::vloadq(input2_ptr + x);
+ const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto val1 = *(input1_ptr + x);
+ const auto val2 = *(input2_ptr + x);
+ *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
+ }
+ },
+ input1, input2, output);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H
diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h
new file mode 100644
index 0000000000..f29571f122
--- /dev/null
+++ b/src/cpu/kernels/sub/neon/list.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_SUB_KERNEL(func_name) \
+ void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+ const Window &window)
+
+DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint);
+DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint);
+DECLARE_SUB_KERNEL(sub_qasymm8_neon);
+DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon);
+DECLARE_SUB_KERNEL(sub_qsymm16_neon);
+DECLARE_SUB_KERNEL(sub_same_neon_fp16);
+
+#undef DECLARE_SUB_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H
diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp
new file mode 100644
index 0000000000..b750afce6e
--- /dev/null
+++ b/src/cpu/kernels/sub/neon/qasymm8.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sub_qasymm8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ add_sub_q8_neon_fixedpoint<uint8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
+}
+
+void sub_qasymm8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..fb0bb62682
--- /dev/null
+++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sub_qasymm8_signed_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ add_sub_q8_neon_fixedpoint<int8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
+}
+
+void sub_qasymm8_signed_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp
new file mode 100644
index 0000000000..23e4b03843
--- /dev/null
+++ b/src/cpu/kernels/sub/neon/qsymm16.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sub_qsymm16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+
+ const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+ const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
+ const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
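+ // Dequantize both operands to fp32 (value * scale), subtract in fp32, then requantize with 1/output_scale and saturate back to int16.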
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+ const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+
+ const float32x4x2_t bf = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
+ }};
+ const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
+ const float32x4x2_t af = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
+ }};
+
+ const int32x4x4_t rf = {{
+#ifdef __aarch64__
+ vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+ : vsubq_f32(af.val[0], bf.val[0]),
+ invvscaleo)),
+ vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+ : vsubq_f32(af.val[1], bf.val[1]),
+ invvscaleo)),
+#else //__aarch64__
+ vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+ : vsubq_f32(af.val[0], bf.val[0]),
+ invvscaleo)),
+ vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+ : vsubq_f32(af.val[1], bf.val[1]),
+ invvscaleo)),
+#endif //__aarch64__
+ }};
+
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+ *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(src0, input1_win);
+ Iterator input2(src1, input2_win);
+ Iterator output(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(input1_ptr + x);
+ const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+ const float32x4x2_t af = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
+ }};
+
+ const float32x4x2_t bf = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
+ }};
+
+ const int32x4x2_t rf = {{
+#ifdef __aarch64__
+ vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+#else //__aarch64__
+ vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+#endif //__aarch64__
+ }};
+
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+ const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute