Diffstat (limited to 'src/cpu/kernels')
-rw-r--r--  src/cpu/kernels/CpuActivationKernel.cpp  12
-rw-r--r--  src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp  349
-rw-r--r--  src/cpu/kernels/CpuDirectConv3dKernel.cpp  20
-rw-r--r--  src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp  4
-rw-r--r--  src/cpu/kernels/CpuKernelSelectionTypes.h  9
-rw-r--r--  src/cpu/kernels/CpuMulKernel.cpp  44
-rw-r--r--  src/cpu/kernels/CpuPermuteKernel.cpp  65
-rw-r--r--  src/cpu/kernels/CpuReshapeKernel.cpp  16
-rw-r--r--  src/cpu/kernels/CpuReshapeKernel.h  19
-rw-r--r--  src/cpu/kernels/CpuScatterKernel.cpp  98
-rw-r--r--  src/cpu/kernels/CpuScatterKernel.h  91
-rw-r--r--  src/cpu/kernels/CpuWeightsReshapeKernel.cpp  7
-rw-r--r--  src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h  8
-rw-r--r--  src/cpu/kernels/assembly/arm_gemm.hpp  12
-rw-r--r--  src/cpu/kernels/assembly/convolution_parameters.hpp  2
-rw-r--r--  src/cpu/kernels/assembly/gemm_common.hpp  25
-rw-r--r--  src/cpu/kernels/conv3d/generic/neon/float_impl.h (renamed from src/cpu/kernels/conv3d/neon/list.h)  17
-rw-r--r--  src/cpu/kernels/conv3d/generic/neon/fp16.cpp  49
-rw-r--r--  src/cpu/kernels/conv3d/generic/neon/fp32.cpp  46
-rw-r--r--  src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp  46
-rw-r--r--  src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp  46
-rw-r--r--  src/cpu/kernels/conv3d/generic/neon/quantized_impl.h (renamed from src/cpu/kernels/conv3d/neon/quantized.h)  12
-rw-r--r--  src/cpu/kernels/conv3d/list.h  47
-rw-r--r--  src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h  186
-rw-r--r--  src/cpu/kernels/directconv2d_output_stage/generic/neon/fp16.cpp  63
-rw-r--r--  src/cpu/kernels/directconv2d_output_stage/generic/neon/fp32.cpp  59
-rw-r--r--  src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8.cpp  59
-rw-r--r--  src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8_signed.cpp  59
-rw-r--r--  src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h  213
-rw-r--r--  src/cpu/kernels/directconv2d_output_stage/list.h  57
-rw-r--r--  src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp  572
-rw-r--r--  src/cpu/kernels/logistic/generic/sme2/fp32.cpp  429
-rw-r--r--  src/cpu/kernels/logistic/list.h  42
-rw-r--r--  src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp  13
-rw-r--r--  src/cpu/kernels/mul/generic/sme2/list.h  38
-rw-r--r--  src/cpu/kernels/mul/generic/sme2/qasymm8_signed.cpp  410
36 files changed, 2602 insertions, 642 deletions
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 4253027231..555705bd45 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -32,6 +32,7 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/activation/list.h"
+#include "src/cpu/kernels/logistic/list.h"
#include <array>
@@ -71,6 +72,12 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel
},
REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)},
#endif // __aarch64__
+ {"sme2_fp32_logistic",
+ [](const ActivationDataTypeISASelectorData &data) {
+ return data.dt == DataType::F32 && data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC &&
+ data.isa.sme2;
+ },
+ REGISTER_FP32_SME2(arm_compute::cpu::sme2_fp32_logistic)},
{"sve2_qu8_activation",
[](const ActivationDataTypeISASelectorData &data) {
return data.dt == DataType::QASYMM8 && data.isa.sve2 &&
@@ -316,6 +323,11 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
// Use squashed window
std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+ // Collapse window with SME kernels in Y-Dim
+ if (std::string(uk->name) == "sme2_fp32_logistic")
+ {
+ win = win.collapse(win, Window::DimY);
+ }
ICPPKernel::configure(win);
}
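
Note: the new sme2_fp32_logistic entry follows the same first-match selection scheme as the other entries in available_kernels: each candidate carries a predicate over the data type, activation function and detected ISA, and the first predicate that returns true picks the micro-kernel. A minimal standalone sketch of that pattern (the enum and struct names below are simplified stand-ins, not the library's actual selector types):

#include <functional>
#include <iostream>
#include <vector>

// Simplified stand-ins for the selector data; the real types live in
// CpuKernelSelectionTypes.h and carry more fields.
enum class DataType { F32, QASYMM8 };
enum class ActFunc  { LOGISTIC, RELU };
struct IsaInfo      { bool sme2 = false; bool sve2 = false; };
struct SelectorData { DataType dt; ActFunc f; IsaInfo isa; };

struct Kernel
{
    const char                               *name;
    std::function<bool(const SelectorData &)> is_selected;
};

// First predicate that returns true wins, mirroring the ordering of available_kernels.
const Kernel *select(const std::vector<Kernel> &kernels, const SelectorData &data)
{
    for (const auto &k : kernels)
    {
        if (k.is_selected(data))
        {
            return &k;
        }
    }
    return nullptr;
}

int main()
{
    const std::vector<Kernel> kernels = {
        {"sme2_fp32_logistic",
         [](const SelectorData &d) { return d.dt == DataType::F32 && d.f == ActFunc::LOGISTIC && d.isa.sme2; }},
        {"neon_fp32_activation", [](const SelectorData &d) { return d.dt == DataType::F32; }},
    };

    const SelectorData data{DataType::F32, ActFunc::LOGISTIC, IsaInfo{true /*sme2*/}};
    const Kernel      *uk = select(kernels, data);
    std::cout << (uk != nullptr ? uk->name : "none") << '\n'; // prints "sme2_fp32_logistic"
    return 0;
}
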
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
index d4af8bedaf..9e9137a266 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,6 +37,7 @@
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/directconv2d_output_stage/list.h"
#include <arm_neon.h>
#include <cstddef>
@@ -95,316 +96,6 @@ Status validate_arguments(const ITensorInfo *src
return Status{};
}
-
-template <typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nchw(ITensor *src,
- const ITensor *bias,
- const Window &window,
- ITensor *dst,
- int result_fixedpoint_multiplier,
- int result_shift,
- int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
- ARM_COMPUTE_UNUSED(result_shift);
- ARM_COMPUTE_UNUSED(result_offset_after_shift);
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator out(dst, win);
- execute_window_loop(
- win,
- [&](const Coordinates &id)
- {
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
- auto v_in = wrapper::vloadq(in_ptr);
-
- // Accumulate bias
- if (has_bias)
- {
- const auto vb = wrapper::vdup_n(
- *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
- v_in = wrapper::vadd(v_in, vb);
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, v_in);
- }
-
- // Left-overs loop
- for (; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
- // Accumulate bias
- if (has_bias)
- {
- const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
-
- *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
- }
- },
- in, out);
-}
-
-template <typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nhwc(ITensor *src,
- const ITensor *bias,
- const Window &window,
- ITensor *dst,
- int result_fixedpoint_multiplier,
- int result_shift,
- int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
- ARM_COMPUTE_UNUSED(result_shift);
- ARM_COMPUTE_UNUSED(result_offset_after_shift);
-
- Window window_bias = window;
- window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
- window_bias.set(3, Window::Dimension(0, 0, 0));
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator bi(bias, window_bias);
- Iterator out(dst, win);
-
- execute_window_loop(
- win,
- [&](const Coordinates &)
- {
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
- auto v_in = wrapper::vloadq(in_ptr + x);
-
- // Accumulate bias
- if (has_bias)
- {
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- wrapper::vstore(out_ptr + x, v_in);
- }
-
- // Left-overs loop
- for (; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
- // Accumulate bias
- if (has_bias)
- {
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- *(out_ptr + x) = s_in;
- }
- },
- in, bi, out);
-}
-
-// Quantized case
-template <
- typename TOut,
- typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
-void output_stage_nchw(ITensor *src,
- const ITensor *bias,
- const Window &window,
- ITensor *dst,
- int result_fixedpoint_multiplier,
- int result_shift,
- int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
-
- const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
- const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator out(dst, win);
-
- execute_window_loop(
- win,
- [&](const Coordinates &id)
- {
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12)}};
-
- // Accumulate bias
- if (has_bias)
- {
- const auto vb = wrapper::vdup_n(
- *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
- v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb),
- wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}};
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
- result_offset_after_shift_s32, min, max, false));
- }
-
- // Left-overs loop
- for (; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Accumulate bias
- if (has_bias)
- {
- const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr =
- finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, out);
-}
-template <
- typename TOut,
- typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
-void output_stage_nhwc(ITensor *src,
- const ITensor *bias,
- const Window &window,
- ITensor *dst,
- int result_fixedpoint_multiplier,
- int result_shift,
- int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
-
- const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
- const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
-
- Window window_bias = window;
- window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
- window_bias.set(3, Window::Dimension(0, 0, 0));
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator bi(bias, window_bias);
- Iterator out(dst, win);
-
- execute_window_loop(
- win,
- [&](const Coordinates &)
- {
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in = {{
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12),
- }};
-
- // Accumulate bias
- if (has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
-
- wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
- wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
- wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
- wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
- result_offset_after_shift_s32, min, max, false));
- }
-
- // Left-overs loop
- for (; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32_t s_in = *in_ptr;
-
- // Accumulate bias
- if (has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr =
- finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, bi, out);
-}
} // namespace
void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src,
@@ -447,24 +138,30 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo
{
if (is_qasymm8_signed)
{
- _func = &output_stage_nchw<int8_t>;
+#ifdef ENABLE_QASYMM8_SIGNED_KERNELS
+ _func = &output_stage_nchw_qs8;
+#endif // ENABLE_QASYMM8_SIGNED_KERNELS
}
else
{
- _func = &output_stage_nchw<uint8_t>;
+#ifdef ENABLE_QASYMM8_KERNELS
+ _func = &output_stage_nchw_qu8;
+#endif // ENABLE_QASYMM8_KERNELS
}
break;
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ENABLE_FP16_KERNELS
case DataType::F16:
{
- _func = &output_stage_nchw<float16_t>;
+ _func = &output_stage_nchw_fp16;
break;
}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif // ENABLE_FP16_KERNELS
case DataType::F32:
{
- _func = &output_stage_nchw<float>;
+#ifdef ENABLE_FP32_KERNELS
+ _func = &output_stage_nchw_fp32;
+#endif // ENABLE_FP32_KERNELS
break;
}
default:
@@ -481,24 +178,30 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo
{
if (is_qasymm8_signed)
{
- _func = &output_stage_nhwc<int8_t>;
+#ifdef ENABLE_QASYMM8_SIGNED_KERNELS
+ _func = &output_stage_nhwc_qs8;
+#endif // ENABLE_QASYMM8_SIGNED_KERNELS
}
else
{
- _func = &output_stage_nhwc<uint8_t>;
+#ifdef ENABLE_QASYMM8_KERNELS
+ _func = &output_stage_nhwc_qu8;
+#endif // ENABLE_QASYMM8_KERNELS
}
break;
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ENABLE_FP16_KERNELS
case DataType::F16:
{
- _func = &output_stage_nhwc<float16_t>;
+ _func = &output_stage_nhwc_fp16;
break;
}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif // ENABLE_FP16_KERNELS
case DataType::F32:
{
- _func = &output_stage_nhwc<float>;
+#ifdef ENABLE_FP32_KERNELS
+ _func = &output_stage_nhwc_fp32;
+#endif // ENABLE_FP32_KERNELS
break;
}
default:
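
Note: the templated output_stage_nchw<T> / output_stage_nhwc<T> implementations removed above are replaced by per-type entry points (output_stage_nchw_fp32, output_stage_nhwc_qs8, and so on) pulled in through directconv2d_output_stage/list.h and compiled only when the matching ENABLE_*_KERNELS flag is defined. The contents of that list header are not part of this hunk; a plausible shape, assuming it mirrors the DECLARE_* macro style of the conv3d/list.h added later in this patch and reuses the signature of the removed functions, might be:

namespace arm_compute
{
namespace cpu
{
namespace kernels
{

#define DECLARE_OUTPUT_STAGE_KERNEL(func_name)                                             \
    void func_name(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, \
                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)

DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nchw_fp32);
DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nchw_fp16);
DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nchw_qu8);
DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nchw_qs8);
DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nhwc_fp32);
DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nhwc_fp16);
DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nhwc_qu8);
DECLARE_OUTPUT_STAGE_KERNEL(output_stage_nhwc_qs8);
#undef DECLARE_OUTPUT_STAGE_KERNEL

} // namespace kernels
} // namespace cpu
} // namespace arm_compute
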
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
index b5b2aed1ba..9c37ece3dd 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,8 +25,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Steps.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -35,10 +35,8 @@
#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/cpu/kernels/conv3d/neon/list.h"
-
-#include <algorithm>
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/conv3d/list.h"
using namespace arm_compute::detail;
@@ -51,18 +49,16 @@ namespace kernels
namespace
{
static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = {
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{"neon_fp16_directconv3d",
[](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)},
-#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+ REGISTER_FP16_NEON(directconv3d_fp16_neon_ndhwc)},
{"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)},
+ REGISTER_FP32_NEON(directconv3d_fp32_neon_ndhwc)},
{"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)},
+ REGISTER_QASYMM8_NEON(directconv3d_qu8_neon_ndhwc)},
{"neon_qasymm8_signed_directconv3d",
[](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}};
+ REGISTER_QASYMM8_SIGNED_NEON(directconv3d_qs8_neon_ndhwc)}};
Status validate_arguments(const ITensorInfo *src0,
const ITensorInfo *src1,
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
index 5b88735e7a..87340e566e 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
@@ -684,6 +684,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_type() == DataType::QASYMM8_SIGNED &&
+ src1->data_type() == DataType::QASYMM8,
+ "QASYMM8_SIGNED input with QASYMM8 weights not supported");
+
TensorShape in0_shape = src0->tensor_shape();
TensorShape in1_shape = src1->tensor_shape();
TensorShape out_shape = dst->tensor_shape();
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 7c1e4772a6..96ddad9d19 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -105,6 +105,13 @@ struct SoftmaxKernelDataTypeISASelectorData
cpuinfo::CpuIsaInfo isa;
bool is_log;
int axis;
+ uint64_t sme2_vector_length;
+};
+
+struct ScatterKernelDataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
unsigned long sme2_vector_length;
};
@@ -124,6 +131,8 @@ using ScaleKernelDataTypeISASelectorDataPtr =
std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
using SoftmaxKernelDataTypeISASelectorDataPtr =
std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type;
+using ScatterKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const ScatterKernelDataTypeISASelectorData &data)>::type;
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
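
Note: ScatterKernelDataTypeISASelectorData mirrors the other selector structs, carrying a data type, the detected ISA features and the SME2 vector length. A selector predicate over it would look like the existing ones; the example below is purely hypothetical, since this patch registers no scatter micro-kernels yet:

#include "src/cpu/kernels/CpuKernelSelectionTypes.h"

// Hypothetical predicate over the new selector type, for illustration only.
bool scatter_fp32_sme2_selected(const arm_compute::cpu::kernels::ScatterKernelDataTypeISASelectorData &data)
{
    return data.dt == arm_compute::DataType::F32 && data.isa.sme2 && data.sme2_vector_length > 0;
}
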
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index 8001482154..d7a3a77d51 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2023 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/mul/generic/neon/list.h"
+#include "src/cpu/kernels/mul/generic/sme2/list.h"
#include <arm_neon.h>
@@ -317,6 +318,41 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
}
}
+bool mul_q8_sme_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
+{
+ const auto &in0_shape = src0->tensor_shape();
+ const auto &in1_shape = src1->tensor_shape();
+ const unsigned int dst_dims = dst->num_dimensions();
+
+ // Calculate Scale
+ const auto iq0 = src0->quantization_info().uniform();
+ const auto iq1 = src1->quantization_info().uniform();
+ const auto oq = dst->quantization_info().uniform();
+ const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
+ const auto max_result = multiplier * (127) * (127) + static_cast<float>(oq.offset);
+ const auto min_result = multiplier * (-128) * (-128) + static_cast<float>(oq.offset);
+
+ // Does not support broadcasting on x
+ // Does not support dims > 4D output, unless input shapes are identical (therefore collapsible)
+ // Checks whether CPU has SME2 Available
+ if (in0_shape.x() == in1_shape.x() && CPUInfo::get().has_sme2() && (in0_shape == in1_shape || dst_dims <= 4))
+ {
+ // Check if multiplier cannot be stored as a 14.18 signed fixed-point number
+ if (multiplier < -8191.f || multiplier > 8191.f)
+ {
+ return false;
+ }
+ // It might not be possible to store the result as a 14.18 signed fixed-point number.
+ if (max_result > 8191.f || min_result < -8191.f)
+ {
+ return false;
+ }
+ // Passed all checks
+ return true;
+ }
+ return false;
+}
+
bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
const ITensorInfo *src1,
const ITensorInfo *dst,
@@ -1563,7 +1599,11 @@ void CpuMulKernel::configure(ITensorInfo *src1,
case DataType::QASYMM8_SIGNED:
if (dt_input2 == DataType::QASYMM8_SIGNED)
{
- if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+ if (mul_q8_sme_possible(src1, src2, dst, scale) && rounding_policy == RoundingPolicy::TO_ZERO)
+ {
+ _func_quantized = REGISTER_QASYMM8_SIGNED_SME2(arm_compute::cpu::sme2_q8_signed_mul);
+ }
+ else if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
{
_func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
}
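
Note: mul_q8_sme_possible guards against values that the SME2 multiply kernel presumably cannot represent in signed 14.18 fixed point: with 14 integer bits (including the sign) and 18 fractional bits the representable range is roughly [-8192, 8192), which the code approximates conservatively as +/-8191 for both the requantisation multiplier and the worst-case results. A standalone sketch of the same arithmetic, with made-up quantisation parameters:

#include <iostream>

// Standalone sketch of the representability check in mul_q8_sme_possible().
// Assumption: the SME2 kernel keeps the requantisation multiplier and the
// worst-case result as signed 14.18 fixed point, i.e. values in roughly
// [-8192, 8192), checked conservatively as +/-8191.
bool fits_q14_18(float value)
{
    return value >= -8191.f && value <= 8191.f;
}

int main()
{
    // Made-up quantisation parameters, purely for illustration.
    const float iq0_scale = 0.02f, iq1_scale = 0.03f, oq_scale = 0.05f, scale = 1.f;
    const int   oq_offset = 10;

    const float multiplier = ((iq0_scale * iq1_scale) / oq_scale) * scale;
    const float max_result = multiplier * 127.f * 127.f + static_cast<float>(oq_offset);
    const float min_result = multiplier * -128.f * -128.f + static_cast<float>(oq_offset);

    const bool sme2_path_ok = fits_q14_18(multiplier) && fits_q14_18(max_result) && fits_q14_18(min_result);
    std::cout << std::boolalpha << sme2_path_ok << '\n'; // true for these parameters
    return 0;
}
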
diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp
index b444a25ff7..c6e0dd3a5e 100644
--- a/src/cpu/kernels/CpuPermuteKernel.cpp
+++ b/src/cpu/kernels/CpuPermuteKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,15 +97,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
template <typename T>
void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm)
{
- const DataLayout src_layout = src->info()->data_layout();
-
// Source window
Window window_src = window;
// we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others
// we have to fall back to C++
- if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) ||
- (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}))
+ if (perm == PermutationVector{2U, 0U, 1U} || perm == PermutationVector{1U, 2U, 0U})
{
window_src.set(Window::DimX,
Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
@@ -128,49 +125,16 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
Iterator src_it(src, window_src);
Iterator dst_it(dst, window_dst);
- int in_row_stride = 0;
- int in_col_stride = 0;
- int in_channel_stride = 0;
- int in_batch_stride = 0;
- int n_cols = 0;
- int n_rows = 0;
- int n_channels = 0;
- int n_batches = 0;
-
- switch (src_layout)
- {
- case DataLayout::NCHW:
- {
- in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T);
- in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T);
- in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
- n_cols = src->info()->tensor_shape().x();
- n_rows = window_src.y().step();
- n_channels = src->info()->tensor_shape().z();
- n_batches = src->info()->tensor_shape()[3];
- break;
- }
- case DataLayout::NHWC:
- {
- in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T);
- in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T);
- in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
- n_channels = src->info()->tensor_shape().x();
- n_cols = window_src.y().step();
- n_rows = src->info()->tensor_shape().z();
- n_batches = src->info()->tensor_shape()[3];
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Invalid source data layout.");
- break;
- }
- }
-
// CHW -> HWC
- if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U})
+ if (perm == PermutationVector{2U, 0U, 1U})
{
+ const int in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T);
+ const int in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T);
+ const int in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
+ const int n_cols = src->info()->tensor_shape().x();
+ const int n_rows = window_src.y().step();
+ const int n_channels = src->info()->tensor_shape().z();
+ const int n_batches = src->info()->tensor_shape()[3];
const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
@@ -188,8 +152,15 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
src_it, dst_it);
}
// HWC -> CHW
- else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})
+ else if (perm == PermutationVector{1U, 2U, 0U})
{
+ const int in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T);
+ const int in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T);
+ const int in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
+ const int n_channels = src->info()->tensor_shape().x();
+ const int n_cols = window_src.y().step();
+ const int n_rows = src->info()->tensor_shape().z();
+ const int n_batches = src->info()->tensor_shape()[3];
const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index 241e58fbce..78b08f19d2 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -233,7 +233,9 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors)
if (!src_has_holes && !dst_has_holes)
{
- std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
+ size_t split_dimension;
+
+ std::tie(win, split_dimension) = calculate_squashed_or_max_window(*dst_info);
/*
Copy the tensor per window. If the src and dst tensors
are contiguous memory allocations without any holes or
@@ -241,7 +243,15 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors)
we can use a single memcopy call to copy the whole
window in reshape_tensor_per_window fn
*/
- _reshape_tensor_fn = reshape_tensor_per_window;
+ if (split_dimension != Window::DimY)
+ {
+ // Fall back when split dimension doesn't equal Window::DimY
+ _reshape_tensor_fn = reshape_tensor_per_row;
+ }
+ else
+ {
+ _reshape_tensor_fn = reshape_tensor_per_window;
+ }
}
else
{
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index ce566fd9e2..acad441b76 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
-#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
+#ifndef ACL_SRC_CPU_KERNELS_CPURESHAPEKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPURESHAPEKERNEL_H
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -74,21 +74,10 @@ public:
*/
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
- /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
- *
- * @return The split dimension.
- */
- size_t get_split_dimension() const
- {
- return _split_dimension;
- }
-
private:
- size_t _split_dimension{Window::DimY};
-
std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */
+#endif // ACL_SRC_CPU_KERNELS_CPURESHAPEKERNEL_H
diff --git a/src/cpu/kernels/CpuScatterKernel.cpp b/src/cpu/kernels/CpuScatterKernel.cpp
new file mode 100644
index 0000000000..bc0fa724b0
--- /dev/null
+++ b/src/cpu/kernels/CpuScatterKernel.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuScatterKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+
+/* Scatter */
+static const std::vector<typename CpuScatterKernel::ScatterKernel> available_kernels = {
+
+};
+
+} // namespace
+
+const std::vector<typename CpuScatterKernel::ScatterKernel> &CpuScatterKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+void CpuScatterKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_UNUSED(src);
+ ARM_COMPUTE_UNUSED(updates);
+ ARM_COMPUTE_UNUSED(indices);
+ ARM_COMPUTE_UNUSED(dst);
+ ARM_COMPUTE_UNUSED(info);
+
+ ARM_COMPUTE_UNUSED(_run_method);
+}
+
+Status CpuScatterKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_UNUSED(src);
+ ARM_COMPUTE_UNUSED(updates);
+ ARM_COMPUTE_UNUSED(indices);
+ ARM_COMPUTE_UNUSED(dst);
+ ARM_COMPUTE_UNUSED(info);
+
+ return Status{ErrorCode::RUNTIME_ERROR, "No configuration implemented yet."};
+}
+
+void CpuScatterKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(tensors);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+
+ ARM_COMPUTE_UNUSED(_run_method);
+}
+
+const char *CpuScatterKernel::name() const
+{
+ return _name.c_str();
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuScatterKernel.h b/src/cpu/kernels/CpuScatterKernel.h
new file mode 100644
index 0000000000..77c436034f
--- /dev/null
+++ b/src/cpu/kernels/CpuScatterKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUSCATTERKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUSCATTERKERNEL_H
+
+#include "arm_compute/function_info/ScatterInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Arm(R) Neon(TM) kernel to perform the ScatterND operation */
+class CpuScatterKernel : public ICpuKernel<CpuScatterKernel>
+{
+private:
+ using ScatterKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, const ITensor *, ITensor *, const ScatterInfo, const Window &)>::type;
+
+public:
+ CpuScatterKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScatterKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor info for the source matrix.
+ * @param[in] updates Input tensor info for the Update matrix. Data type supported: same as @p src
+ * @param[in] indices Input tensor info for the Indices matrix. Data type supported: U32.
+ * @param[out] dst Output tensor info. Data type supported: same as @p src
+ * @param[in] info Attributes for Scatter Kernel
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const ScatterInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuScatterKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const ScatterInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+ struct ScatterKernel
+ {
+ const char *name;
+ ScatterKernelPtr ukernel;
+ };
+
+ static const std::vector<ScatterKernel> &get_available_kernels();
+
+private:
+ ScatterKernelPtr _run_method{nullptr};
+ std::string _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUSCATTERKERNEL_H
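
Note: at this stage CpuScatterKernel is scaffolding: the available-kernels vector is empty, configure() and run_op() are no-ops, and validate() unconditionally returns RUNTIME_ERROR. A sketch of what a caller would currently observe; the tensor-info and ScatterInfo setup is assumed to happen elsewhere and the names below are placeholders:

#include "arm_compute/function_info/ScatterInfo.h"

#include "src/cpu/kernels/CpuScatterKernel.h"

void probe_scatter_kernel(const arm_compute::ITensorInfo &src_info,
                          const arm_compute::ITensorInfo &updates_info,
                          const arm_compute::ITensorInfo &indices_info,
                          const arm_compute::ITensorInfo &dst_info,
                          const arm_compute::ScatterInfo &scatter_info)
{
    using arm_compute::cpu::kernels::CpuScatterKernel;

    // Empty in this patch: no micro-kernels are registered yet.
    const auto &kernels = CpuScatterKernel::get_available_kernels();
    (void)kernels;

    // Expected to report RUNTIME_ERROR ("No configuration implemented yet.")
    // until a real configuration lands.
    const arm_compute::Status status =
        CpuScatterKernel::validate(&src_info, &updates_info, &indices_info, &dst_info, scatter_info);
    (void)status;
}
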
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
index 297ba63826..f8e9d123b5 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, con
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
get_output_shape(src, biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ if (!src->quantization_info().is_dynamic())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
}
return Status{};
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index e2a27675b3..72fafca1bb 100644
--- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -52,7 +52,7 @@ namespace kernel
*
*
*/
-template <typename TypeInput, typename TypeOutput>
+template <typename TypeInput, typename TypeWeight, typename TypeOutput>
class CpuGemmAssemblyWrapperKernel final : public INEKernel
{
public:
@@ -101,7 +101,7 @@ public:
* @param[in] kernel Pointer to an assembly kernel implementation.
* @param[in] kernel_name_tag Tag to be attached to the kernel's name.
*/
- void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
+ void configure(arm_gemm::GemmCommon<TypeInput, TypeWeight, TypeOutput> *kernel, std::string kernel_name_tag)
{
ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
_kernel = kernel;
@@ -131,8 +131,8 @@ public:
}
private:
- arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
- std::string _name;
+ arm_gemm::GemmCommon<TypeInput, TypeWeight, TypeOutput> *_kernel;
+ std::string _name;
};
} // namespace kernel
} // namespace cpu
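
Note: CpuGemmAssemblyWrapperKernel now carries an explicit weight type alongside the input and output types, so the LHS and the weights of an assembly GEMM no longer have to share a data type. A hypothetical alias showing the shape of a mixed-type instantiation; whether this exact combination is used anywhere is not shown in this patch:

#include <cstdint>

#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"

// Hypothetical alias only: a wrapper around an assembly GEMM whose input and
// weight types differ.
using MixedSignGemmWrapper =
    arm_compute::cpu::kernel::CpuGemmAssemblyWrapperKernel<uint8_t /*TypeInput*/, int8_t /*TypeWeight*/, int32_t /*TypeOutput*/>;
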
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
index 941fed0ba8..cbc8be416e 100644
--- a/src/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -277,8 +277,8 @@ struct Nothing
{
};
-template <typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;
+template <typename Tlop, typename Trop, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Tlop, Trop, Tret>>;
/* Low level API calls.
* These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
@@ -288,13 +288,13 @@ using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;
template <typename Top, typename Tret, class OutputStage = Nothing>
KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});
-template <typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});
+template <typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing>
+UniqueGemmCommon<Tlop, Trop, Tret> gemm(const GemmArgs &args, const OutputStage & = {});
-template <typename Top, typename Tret, class OutputStage = Nothing>
+template <typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing>
std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});
-template <typename Top, typename Tret, class OutputStage = Nothing>
+template <typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing>
bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const OutputStage & = {});
} // namespace arm_gemm
diff --git a/src/cpu/kernels/assembly/convolution_parameters.hpp b/src/cpu/kernels/assembly/convolution_parameters.hpp
index a6cf96344c..09b73ca409 100644
--- a/src/cpu/kernels/assembly/convolution_parameters.hpp
+++ b/src/cpu/kernels/assembly/convolution_parameters.hpp
@@ -61,6 +61,8 @@ struct ConvolutionParameters
int64_t output_stride_w;
int64_t output_stride_h;
// output_channels not included as they do not affect the input.
+ int64_t dilation_w;
+ int64_t dilation_h;
int64_t padding_top;
int64_t padding_left;
float padding_value;
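
Note: the new dilation_w / dilation_h fields let the assembly convolution describe dilated kernels. The relation they presumably feed into is that a dilated kernel's effective footprint along one axis is (kernel_size - 1) * dilation + 1; a tiny self-contained illustration:

#include <cstdint>

// Effective footprint of a dilated kernel along one axis: a 3-tap kernel with
// dilation 2 covers (3 - 1) * 2 + 1 = 5 input positions.
constexpr int64_t effective_kernel_size(int64_t kernel_size, int64_t dilation)
{
    return (kernel_size - 1) * dilation + 1;
}

static_assert(effective_kernel_size(3, 1) == 3, "dilation 1 is a dense kernel");
static_assert(effective_kernel_size(3, 2) == 5, "3-tap kernel with dilation 2 spans 5 inputs");
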
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index 45d1e43274..d5676c134e 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Avoid circular dependency with arm_gemm.hpp
struct GemmConfig;
+struct Requantize32;
// Abstract class for the GEMM/GEMV functions.
//
@@ -160,6 +161,12 @@ public:
{
}
+ /*** "Quantization update" interface (optional) ***/
+ /* Update quantization parameters at run time */
+ virtual void update_quantization_parameters(const Requantize32 &)
+ {
+ }
+
/*** Convolution interface (optional) ***/
/* Set the convolution parameters. */
virtual void set_convolution_parameters(ConvolutionParameters)
@@ -189,7 +196,7 @@ public:
* 'set_arrays' to capture the provided arguments in protected class
* members, as essentially any implementation will need these.
*/
-template <typename To, typename Tr>
+template <typename To, typename Tw, typename Tr>
class GemmCommon : public IGemmCommon
{
protected:
@@ -197,7 +204,7 @@ protected:
int _lda = 0;
int _A_batch_stride = 0;
int _A_multi_stride = 0;
- const To *_Bptr = nullptr;
+ const Tw *_Bptr = nullptr;
int _ldb = 0;
int _B_multi_stride = 0;
Tr *_Cptr = nullptr;
@@ -214,7 +221,7 @@ public:
const int lda,
const int A_batch_stride,
const int A_multi_stride,
- const To *B,
+ const Tw *B,
const int ldb,
/* batches share B */ const int B_multi_stride,
Tr *C,
@@ -254,7 +261,7 @@ public:
const void *bias,
/* no row or batch stride needed */ const int bias_multi_stride) override
{
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb,
+ set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const Tw *>(B), ldb,
B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
static_cast<const Tr *>(bias), bias_multi_stride);
}
@@ -262,17 +269,17 @@ public:
/*** "Pretransposed" interface ***/
/* Compute col sums over all columns */
- virtual void requantize_bias(void *, const To *, const int, const int){};
+ virtual void requantize_bias(void *, const Tw *, const int, const int){};
/* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
/* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int, bool){};
+ virtual void pretranspose_B_array(void *, const Tw *, const int, const int, bool){};
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
void pretranspose_B_array_generic(
void *out, const void *in, const int row_stride, const int multi_stride, bool transposed) override
{
- pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride, transposed);
+ pretranspose_B_array(out, static_cast<const Tw *>(in), row_stride, multi_stride, transposed);
}
/* Threaded versions of the above.
@@ -280,7 +287,7 @@ public:
* just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only
* legal values for start and end are 0 and 1 respectively. */
virtual void pretranspose_B_array_part(
- void *out, const To *in, const int row_stride, const int multi_stride, bool transposed, size_t, size_t)
+ void *out, const Tw *in, const int row_stride, const int multi_stride, bool transposed, size_t, size_t)
{
pretranspose_B_array(out, in, row_stride, multi_stride, transposed);
};
@@ -293,7 +300,7 @@ public:
size_t start,
size_t end) override
{
- pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, transposed, start, end);
+ pretranspose_B_array_part(out, static_cast<const Tw *>(in), row_stride, multi_stride, transposed, start, end);
}
/*** Indirect interface ***/
diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/generic/neon/float_impl.h
index 082c60be29..5b5611a02f 100644
--- a/src/cpu/kernels/conv3d/neon/list.h
+++ b/src/cpu/kernels/conv3d/generic/neon/float_impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,21 +21,25 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
-#define SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_FLOAT_IMPL_H
+#define ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_FLOAT_IMPL_H
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/cpu/kernels/conv3d/neon/quantized.h"
namespace arm_compute
{
namespace cpu
{
+namespace kernels
+{
+
template <typename T>
void directconv3d_float_neon_ndhwc(const ITensor *src0,
const ITensor *src1,
@@ -192,6 +196,7 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0,
out);
}
+} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
+#endif // ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_FLOAT_IMPL_H
diff --git a/src/cpu/kernels/conv3d/generic/neon/fp16.cpp b/src/cpu/kernels/conv3d/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..1737556e51
--- /dev/null
+++ b/src/cpu/kernels/conv3d/generic/neon/fp16.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/conv3d/generic/neon/float_impl.h"
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void directconv3d_fp16_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
+{
+ directconv3d_float_neon_ndhwc<float16_t>(src0, src1, src2, dst, conv_info, window);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/conv3d/generic/neon/fp32.cpp b/src/cpu/kernels/conv3d/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..1cd0793442
--- /dev/null
+++ b/src/cpu/kernels/conv3d/generic/neon/fp32.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/conv3d/generic/neon/float_impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+
+void directconv3d_fp32_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
+{
+ directconv3d_float_neon_ndhwc<float>(src0, src1, src2, dst, conv_info, window);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp b/src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..d0cb6fc1c1
--- /dev/null
+++ b/src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/conv3d/generic/neon/quantized_impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+
+void directconv3d_qu8_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
+{
+ directconv3d_quantized_neon_ndhwc<uint8_t>(src0, src1, src2, dst, conv_info, window);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..adffc1a3f8
--- /dev/null
+++ b/src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/conv3d/generic/neon/quantized_impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+
+void directconv3d_qs8_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
+{
+ directconv3d_quantized_neon_ndhwc<int8_t>(src0, src1, src2, dst, conv_info, window);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/generic/neon/quantized_impl.h
index f0fc9b5a71..b6b41035f8 100644
--- a/src/cpu/kernels/conv3d/neon/quantized.h
+++ b/src/cpu/kernels/conv3d/generic/neon/quantized_impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,9 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
-#define SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
+#ifndef ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_QUANTIZED_IMPL_H
+#define ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_QUANTIZED_IMPL_H
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
@@ -37,6 +38,8 @@ namespace arm_compute
{
namespace cpu
{
+namespace kernels
+{
template <typename T>
void directconv3d_quantized_neon_ndhwc(const ITensor *src0,
const ITensor *src1,
@@ -270,6 +273,7 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0,
},
out);
}
+} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
+#endif // ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_QUANTIZED_IMPL_H
diff --git a/src/cpu/kernels/conv3d/list.h b/src/cpu/kernels/conv3d/list.h
new file mode 100644
index 0000000000..256d28825d
--- /dev/null
+++ b/src/cpu/kernels/conv3d/list.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CONV3D_LIST_H
+#define ACL_SRC_CPU_KERNELS_CONV3D_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+
+#define DECLARE_CONV3D_KERNEL(func_name) \
+ void func_name(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, \
+ const Conv3dInfo &conv_info, const Window &window)
+
+DECLARE_CONV3D_KERNEL(directconv3d_fp16_neon_ndhwc);
+DECLARE_CONV3D_KERNEL(directconv3d_fp32_neon_ndhwc);
+DECLARE_CONV3D_KERNEL(directconv3d_qu8_neon_ndhwc);
+DECLARE_CONV3D_KERNEL(directconv3d_qs8_neon_ndhwc);
+#undef DECLARE_CONV3D_KERNEL
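+
+// For reference, each DECLARE_CONV3D_KERNEL(name) above expands to a plain
+// declaration of the form (shown here for the fp32 variant):
+//
+//   void directconv3d_fp32_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2,
+//                                     ITensor *dst, const Conv3dInfo &conv_info, const Window &window);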
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CONV3D_LIST_H
diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h b/src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h
new file mode 100644
index 0000000000..266fa68ab2
--- /dev/null
+++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_FLOAT_IMPL_H
+#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_FLOAT_IMPL_H
+
+#include "arm_compute/core/Helpers.h" // Iterator
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
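+// The two templates in this file implement the floating-point output stage of
+// direct convolution. The fixed-point parameters are only accepted to keep the
+// signature in line with the quantized path and are unused here: each kernel
+// adds the optional per-output-channel bias to the accumulator tensor and
+// stores the result.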
+template <typename T>
+void output_stage_nchw_fp(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator out(dst, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
+ auto v_in = wrapper::vloadq(in_ptr);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
+ v_in = wrapper::vadd(v_in, vb);
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, v_in);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
+
+ *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
+ }
+ },
+ in, out);
+}
+
+template <typename T>
+void output_stage_nhwc_fp(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator bi(bias, window_bias);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
+ auto v_in = wrapper::vloadq(in_ptr + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ wrapper::vstore(out_ptr + x, v_in);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ *(out_ptr + x) = s_in;
+ }
+ },
+ in, bi, out);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_FLOAT_IMPL_H
diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp16.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..d7550da721
--- /dev/null
+++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp16.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h"
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void output_stage_nhwc_fp16(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nhwc_fp<float16_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+void output_stage_nchw_fp16(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nchw_fp<float16_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp32.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..05dec370b9
--- /dev/null
+++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp32.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void output_stage_nhwc_fp32(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nhwc_fp<float>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+void output_stage_nchw_fp32(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nchw_fp<float>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..dbdf951b6f
--- /dev/null
+++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void output_stage_nhwc_qu8(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nhwc_quant<uint8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+void output_stage_nchw_qu8(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nchw_quant<uint8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..c00fe87161
--- /dev/null
+++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+void output_stage_nhwc_qs8(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nhwc_quant<int8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+void output_stage_nchw_qs8(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ output_stage_nchw_quant<int8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h b/src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h
new file mode 100644
index 0000000000..f74ed55ad0
--- /dev/null
+++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_QUANTIZED_IMPL_H
+#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_QUANTIZED_IMPL_H
+
+#include "arm_compute/core/Helpers.h" // Iterator
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+
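+// The two templates below implement the quantized output stage of direct
+// convolution: the int32 accumulators are (optionally) biased per output
+// channel and then requantized by finalize_quantization() using the given
+// fixed-point multiplier, shift and output offset, clamping to the TOut
+// (uint8_t / int8_t) range before the narrowing store.
+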
+template <typename TOut>
+void output_stage_nchw_quant(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
+ using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+
+ const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
+ const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12)}};
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
+ v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb),
+ wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}};
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+ }
+ },
+ in, out);
+}
+template <
+ typename TOut,
+ typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nhwc_quant(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
+{
+ const bool has_bias = bias != nullptr;
+ using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
+ using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+
+ const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
+ const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
+
+ Window window_bias = window;
+ window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / src->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(src, win);
+ Iterator bi(bias, window_bias);
+ Iterator out(dst, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{
+ wrapper::vloadq(in_ptr),
+ wrapper::vloadq(in_ptr + 4),
+ wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12),
+ }};
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+
+                    v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
+                    v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
+                    v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
+                    v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32_t s_in = *in_ptr;
+
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+ }
+ },
+ in, bi, out);
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_QUANTIZED_IMPL_H
diff --git a/src/cpu/kernels/directconv2d_output_stage/list.h b/src/cpu/kernels/directconv2d_output_stage/list.h
new file mode 100644
index 0000000000..9372269bca
--- /dev/null
+++ b/src/cpu/kernels/directconv2d_output_stage/list.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_LIST_H
+#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_LIST_H
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+
+#define DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(func_name) \
+ void func_name(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, \
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_fp32);
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_fp16);
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_fp32);
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_fp16);
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_qs8);
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_qu8);
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_qs8);
+DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_qu8);
+
+#undef DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL
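+
+// The declarations above are implemented in the per-data-type translation units
+// of this directory (fp32.cpp, fp16.cpp, qasymm8.cpp and qasymm8_signed.cpp), so
+// that each variant can be guarded and compiled independently (e.g. the fp16
+// kernels are only built when ENABLE_FP16_KERNELS is defined).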
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_LIST_H
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
index 404d070a37..580fdc3e8f 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,7 +81,7 @@ void vector_matrix_multiply_f32(
// window_end_x is computed above which may cause out-of-bound writes to the dst.
for (; x < (window_end_x - window_step_x); x += window_step_x)
{
- if (x > width_matrix_b)
+ if (x >= width_matrix_b)
{
return;
}
@@ -203,7 +203,7 @@ void vector_matrix_multiply_f32(
// Left-over loop
for (; x < window_end_x; ++x)
{
- if (x > width_matrix_b)
+ if (x >= width_matrix_b)
{
return;
}
@@ -309,9 +309,21 @@ void matrix_matrix_multiply_f32(
Iterator inb(rhs, win_b);
Iterator out(dst, window);
- const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
+ // End address of matrix B at batch number n
+ const float *end_addr_mtx_b_at_batch_n =
+ reinterpret_cast<const float *>(inb.ptr()) + rhs->info()->dimension(0) * rhs->info()->dimension(1);
+ std::vector<const float *> end_addr_mtx_b_per_batch = {};
+ const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
+ const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+    const size_t      out_dim2       = static_cast<size_t>(dst->info()->dimension(2));
- const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+ for (size_t b = 0; b < out_dim2; ++b)
+ {
+        // Store, for each batch, the address one past the last element of matrix B
+ end_addr_mtx_b_per_batch.push_back(end_addr_mtx_b_at_batch_n);
+ end_addr_mtx_b_at_batch_n +=
+ rhs->info()->dimension(2) != 1 ? rhs->info()->dimension(0) * rhs->info()->dimension(1) : 0;
+ }
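+
+    // Rationale: when the reshaped matrix B has an odd number of rows in a batch,
+    // the second row pointer (mtx_b1) of the last row pair points past the end of
+    // that batch. The per-batch end addresses collected above let the inner loops
+    // below detect this case and fall back to accumulating a single B row rather
+    // than reading out of bounds.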
// The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
// The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
@@ -341,220 +353,374 @@ void matrix_matrix_multiply_f32(
#endif /* __arm__ */
auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
- for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
+
+ ARM_COMPUTE_ERROR_ON(end_addr_mtx_b_per_batch.size() == 0);
+ if (mtx_b1 < end_addr_mtx_b_per_batch[id.z()])
{
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ for (; mtx_b0 < (mtx_b0_end_addr - 32);)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
- float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
- float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+ float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
- float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
- float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
- float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
-
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+                // Only consider one row from matrix B if the subsequent row is out of bounds.
+ for (; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ mtx_a0 += 4;
+ mtx_b0 += 4;
+ mtx_b1 += 4;
+ }
}
- for (; mtx_b0 < mtx_b0_end_addr;)
+            // Leftover last row of matrix B, in case matrix B has an odd number of rows
+ else if (mtx_b0 < end_addr_mtx_b_per_batch[id.z()])
{
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
+ for (; mtx_b0 < (mtx_b0_end_addr - 32);)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
#if __arm__
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- mtx_a0 += 4;
- mtx_b0 += 4;
- mtx_b1 += 4;
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b01 = vld1q_f32(mtx_b0 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b01 = vld1q_f32(mtx_b0 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+#endif /* __arm__ */
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b01 = vld1q_f32(mtx_b0 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ }
+ for (; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+#endif /* __arm__ */
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ mtx_a0 += 4;
+ mtx_b0 += 4;
+ }
}
// Multiply by the weight of matrix product (alpha)
diff --git a/src/cpu/kernels/logistic/generic/sme2/fp32.cpp b/src/cpu/kernels/logistic/generic/sme2/fp32.cpp
new file mode 100644
index 0000000000..876e466594
--- /dev/null
+++ b/src/cpu/kernels/logistic/generic/sme2/fp32.cpp
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+// This function expects a collapsed 2D shape.
+void sme2_f32_logistic_kernel(const float *src,
+ float *dst,
+ const uintptr_t shape[2],
+ const uintptr_t src_strides[2],
+ const uintptr_t dst_strides[2])
+{
+ // Precondition:
+ assert(src_strides[0] == sizeof(float));
+ assert(dst_strides[0] == sizeof(float));
+ __asm__ volatile(
+ R"(
+ .inst 0xd503477f // smstart
+
+ ptrue p0.b
+ .inst 0x25207811 // ptrue pn9.b
+
+ // Registers
+ //
+ // * x9: temporary, index
+ // * x10: temporary, inf
+ // * x11: temporary, 0
+ // * x12: temporary, 1.0f
+ // * x13: temporary, body_length
+ //
+ // * x26: index_1
+ // * x27: src_1
+ // * x28: dst_1
+ //
+ // * z0: c1
+ // * z1: c2
+ // * z2: c3
+ // * z3: c4
+ // * z4: c5
+ // * z5: shift
+ // * z6: inv_ln2
+ // * z7: neg_ln2_hi
+ // * z8: neg_ln2_lo
+    // * z9: min_input
+ // * z10: 23, 0, 1, inf
+ // * z11: max_value
+ // * z12-z15: x, r_hi, r, r2
+    // * z16-z19: shift, z, scale, poly
+ // * z20-z21: n, p1, p12345
+ // * z22-z23: n, p23, p2345
+ // * z24-z25: p45
+ // * z26: max_input
+ // * z28-z31: sum_value
+ //
+ // * za0-za3: sum_value
+ //
+ // * p0: all-true
+    // * p1-p4: underflow
+    // * p1: leftover predicate (leftover loop only)
+    // * p5-p8: overflow
+ // * pn9: all-true
+
+    // TAYLOR SERIES CONSTANTS
+ mov w9, #0xfff6 // c1: 0x1.ffffecp-1f = 0x3f7ffff6
+ mov w10, #0xfedb // c2: 0x1.fffdb6p-2f = 0x3efffedb
+ mov w11, #0xaf33 // c3: 0x1.555e66p-3f = 0x3e2aaf33
+ mov w12, #0x9f17 // c4: 0x1.573e2ep-5f = 0x3d2b9f17
+ mov w13, #0x2010 // c5: 0x1.0e4020p-7f = 0x3c072010
+
+ movk w9, #0x3f7f, LSL #16 // c1: 0x1.ffffecp-1f = 0x3f7ffff6
+ movk w10, #0x3eff, LSL #16 // c2: 0x1.fffdb6p-2f = 0x3efffedb
+    movk w11, #0x3e2a, LSL #16 // c3: 0x1.555e66p-3f = 0x3e2aaf33
+ movk w12, #0x3d2b, LSL #16 // c4: 0x1.573e2ep-5f = 0x3d2b9f17
+ movk w13, #0x3c07, LSL #16 // c5: 0x1.0e4020p-7f = 0x3c072010
+
+ dup z0.s, w9 // c1.
+ dup z1.s, w10 // c2.
+ dup z2.s, w11 // c3.
+ dup z3.s, w12 // c4.
+ dup z4.s, w13 // c5.
+
+ mov w9, #0x007f // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f
+ mov w10, #0xaa3b // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b
+ mov w11, #0x7200 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200
+ mov w12, #0xbe8e // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e
+ mov w13, #0x47ae // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae
+ mov w14, #0xBD71 // max_input 88.37 = 0x42B0BD71
+
+ movk w9, #0x4b00, LSL #16 // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f
+ movk w10, #0x3fb8, LSL #16 // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b
+ movk w11, #0xbf31, LSL #16 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200
+ movk w12, #0xb5bf, LSL #16 // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e
+ movk w13, #0xc2ad, LSL #16 // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae
+ movk w14, #0x42B0, LSL #16 // max_input (88.37) = 0x42B0BD71
+
+ dup z5.s, w9 // shift
+ dup z6.s, w10 // inv_ln2
+ dup z7.s, w11 // neg_ln2_hi
+ dup z8.s, w12 // neg_ln2_lo
+ dup z9.s, w13 // min_input
+ dup z26.s, w14 // max_input
+
+ mov w10, #0x0000 // inf: 0x7F800000
+ movk w10, #0x7F80, LSL #16 // inf: 0x7F800000
+
+ mov w15, #0x0000
+ movk w15, #0x3F80, LSL #16 // 1
+
+ mov w11, #0 // 0
+
+ // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl
+ cntw x13, ALL, MUL #4 // x13 is vl
+ udiv x9, %x[length], x13 // length/vl
+ mul x13, x13, x9 // x13 = vl * result
+
+ // ==================================================
+ // Outer loop opening
+ // ==================================================
+
+ mov x27, %x[src] // starting point of pointers for src.
+ mov x28, %x[dst] // starting point of pointers for dst.
+ mov x26, %x[shape_1]
+
+outer_loop_start%=:
+ // for index_1 in shape_1 downto 1
+ cmp x26, #0
+ b.eq outer_loop_end%=
+ sub x26, x26, #1
+
+ mov x9, #0 // x9: index
+
+inner_body_start%=:
+ cmp x9, x13
+ b.eq inner_body_end%=
+
+ // Loads the input data to 4 consecutive registers ---------------- z12-z15: input_data
+ .inst 0xa009c76c // ld1w {z12.s-z15.s}, pn9/z, [x27, x9, LSL #2]
+
+ // ---------------------------------------------------------------- z12-z15: x = neg(x)
+ fneg z12.s, p0/m, z12.s
+ fneg z13.s, p0/m, z13.s
+ fneg z14.s, p0/m, z14.s
+ fneg z15.s, p0/m, z15.s
+
+    // ---------------------------------------------------------------- p1-p4: underflow = x < min_input
+ fcmlt p1.s, p0/z, z12.s, z9.s
+ fcmlt p2.s, p0/z, z13.s, z9.s
+ fcmlt p3.s, p0/z, z14.s, z9.s
+ fcmlt p4.s, p0/z, z15.s, z9.s
+
+    // ---------------------------------------------------------------- p5-p8: overflow = x > max_input
+ fcmlt p5.s, p0/z, z26.s, z12.s
+ fcmlt p6.s, p0/z, z26.s, z13.s
+ fcmlt p7.s, p0/z, z26.s, z14.s
+ fcmlt p8.s, p0/z, z26.s, z15.s
+
+ // ---------------------------------------------------------------- z16-z19: shift
+ mov z16.d, z5.d
+ mov z17.d, z5.d
+ mov z18.d, z5.d
+ mov z19.d, z5.d
+
+ // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2
+ fmla z16.s, p0/m, z12.s, z6.s
+ fmla z17.s, p0/m, z13.s, z6.s
+ fmla z18.s, p0/m, z14.s, z6.s
+ fmla z19.s, p0/m, z15.s, z6.s
+
+ // ---------------------------------------------------------------- z20-z23: n = z - shift
+ fsub z20.s, z16.s, z5.s
+ fsub z21.s, z17.s, z5.s
+ fsub z22.s, z18.s, z5.s
+ fsub z23.s, z19.s, z5.s
+
+ // ---------------------------------------------------------------- z12-z15: r_hi = x + n * neg_ln2_hi
+ fmla z12.s, p0/m, z20.s, z7.s
+ fmla z13.s, p0/m, z21.s, z7.s
+ fmla z14.s, p0/m, z22.s, z7.s
+ fmla z15.s, p0/m, z23.s, z7.s
+
+ // ---------------------------------------------------------------- z12-z15: r = r_hi + n * neg_ln2_lo
+ fmla z12.s, p0/m, z20.s, z8.s
+ fmla z13.s, p0/m, z21.s, z8.s
+ fmla z14.s, p0/m, z22.s, z8.s
+ fmla z15.s, p0/m, z23.s, z8.s
+
+ // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n)
+ dup z10.s, #23
+ urshl z16.s, p0/m, z16.s, z10.s
+ urshl z17.s, p0/m, z17.s, z10.s
+ urshl z18.s, p0/m, z18.s, z10.s
+ urshl z19.s, p0/m, z19.s, z10.s
+
+ // Processes the first 2 vectors.
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z12.s, z0.s
+ fmul z21.s, z13.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z12.s, z2.s
+ fmla z23.s, p0/m, z13.s, z2.s
+
+    // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z12.s, z4.s
+ fmla z25.s, p0/m, z13.s, z4.s
+
+ // ---------------------------------------------------------------- z12-z13: r2 = r * r
+ fmul z12.s, z12.s, z12.s
+ fmul z13.s, z13.s, z13.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z12.s, z24.s
+ fmla z23.s, p0/m, z13.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z12.s, z22.s
+ fmla z21.s, p0/m, z13.s, z23.s
+
+ // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale
+ fmla z16.s, p0/m, z20.s, z16.s
+ fmla z17.s, p0/m, z21.s, z17.s
+
+ // Processes the last 2 vectors
+
+ // ---------------------------------------------------------------- z20-z21: p1 = r * c1
+ fmul z20.s, z14.s, z0.s
+ fmul z21.s, z15.s, z0.s
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2
+ mov z22.d, z1.d
+ mov z23.d, z1.d
+
+ // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3
+ fmla z22.s, p0/m, z14.s, z2.s
+ fmla z23.s, p0/m, z15.s, z2.s
+
+    // ---------------------------------------------------------------- z24-z25: c4
+ mov z24.d, z3.d
+ mov z25.d, z3.d
+
+ // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5
+ fmla z24.s, p0/m, z14.s, z4.s
+ fmla z25.s, p0/m, z15.s, z4.s
+
+ // ---------------------------------------------------------------- z14-z15: r2 = r * r
+ fmul z14.s, z14.s, z14.s
+ fmul z15.s, z15.s, z15.s
+
+ // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45
+ fmla z22.s, p0/m, z14.s, z24.s
+ fmla z23.s, p0/m, z15.s, z25.s
+
+ // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345
+ fmla z20.s, p0/m, z14.s, z22.s
+ fmla z21.s, p0/m, z15.s, z23.s
+
+ // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale
+ fmla z18.s, p0/m, z20.s, z18.s
+ fmla z19.s, p0/m, z21.s, z19.s
+
+ // ---------------------------------------------------------------- z16-z19: poly = underflow ? 0 : poly
+ dup z10.s, #0
+ sel z16.s, p1, z10.s, z16.s
+ sel z17.s, p2, z10.s, z17.s
+ sel z18.s, p3, z10.s, z18.s
+ sel z19.s, p4, z10.s, z19.s
+
+ // ---------------------------------------------------------------- z16-z19: poly = overflow ? inf : poly
+ dup z10.s, w10 // z10: inf
+ sel z16.s, p5, z10.s, z16.s
+ sel z17.s, p6, z10.s, z17.s
+ sel z18.s, p7, z10.s, z18.s
+ sel z19.s, p8, z10.s, z19.s
+
+    // 1 / (1 + poly)
+ dup z10.s, w15 // z10: 1
+ fadd z16.s, z10.s, z16.s // poly + 1
+ fadd z17.s, z10.s, z17.s // poly + 1
+ fadd z18.s, z10.s, z18.s // poly + 1
+ fadd z19.s, z10.s, z19.s // poly + 1
+
+ fdivr z16.s, p0/m, z16.s, z10.s // z16: 1/(poly+1)
+    fdivr z17.s, p0/m, z17.s, z10.s // z17: 1/(poly+1)
+    fdivr z18.s, p0/m, z18.s, z10.s // z18: 1/(poly+1)
+    fdivr z19.s, p0/m, z19.s, z10.s // z19: 1/(poly+1)
+
+ // Stores 4 consecutive registers to the output
+ .inst 0xa029c790 // st1w {z16.s-z19.s}, pn9, [x28, x9, LSL #2]
+
+ incw x9, ALL, MUL #4
+ b inner_body_start%=
+inner_body_end%=:
+
+inner_leftover_start%=:
+ // Largely ordinary SVE code handling the same 1/(1+e^-x) approximation for the leftover loop.
+ whilelo p1.s, x9, %x[length] // While x9<length
+ b.none inner_leftover_end%=
+
+ ld1w z12.s, p1/z, [x27, x9, LSL #2] // z12: input data
+ fneg z12.s, p1/m, z12.s
+
+ mov z16.d, z5.d // z16: shift
+ fcmlt p4.s, p1/z, z12.s, z9.s // p4: underflow = x < min_input
+ fcmlt p5.s, p1/z, z26.s, z12.s // p5: overflow = x > max_input
+
+ fmla z16.s, p1/m, z12.s, z6.s // z16: z = shift + x * inv_ln2
+ fsub z20.s, z16.s, z5.s // z20: n = z - shift
+ fmla z12.s, p1/m, z20.s, z7.s // z12: r_hi = x + n * neg_ln2_hi
+ fmla z12.s, p1/m, z20.s, z8.s // z12: r = r_hi + n * neg_ln2_lo
+ dup z10.s, #23 // z10: 23
+ urshl z16.s, p1/m, z16.s, z10.s // z16: scale = z << 23 (2^n)
+ fmul z20.s, z12.s, z0.s // z20: p1 = r * c1
+ mov z22.d, z1.d // z22: p23 = c2
+ fmla z22.s, p1/m, z12.s, z2.s // z22: p23 = c2 + r * c3
+ mov z24.d, z3.d // z24: c4
+ fmla z24.s, p1/m, z12.s, z4.s // z24: p45 = c4 + r * c5
+ fmul z12.s, z12.s, z12.s // z12: r2 = r * r
+ fmla z22.s, p1/m, z12.s, z24.s // z22: p2345 = p23 + r2 * p45
+ fmla z20.s, p1/m, z12.s, z22.s // z20: p12345 = p1 + r2 * p2345
+ fmla z16.s, p1/m, z20.s, z16.s // z16: poly = scale + p12345 * scale
+ dup z10.s, #0 // z10: 0
+ sel z16.s, p4, z10.s, z16.s // z16: poly = underflow ? 0 : poly
+ dup z10.s, w10 // z10: inf
+ sel z16.s, p5, z10.s, z16.s // z16: poly = overflow ? inf : poly
+
+ // 1 / (1 + poly)
+ dup z10.s, w15 // z10: 1
+ fadd z16.s, z10.s, z16.s // z16: z16 + 1
+ fdivr z16.s, p0/m, z16.s, z10.s // z16: 1/(poly+1)
+
+ st1w z16.s, p1, [x28, x9, LSL #2]
+
+ incw x9 // x9 += number of 32-bit elements per vector
+ b inner_leftover_start%=
+inner_leftover_end%=:
+
+ // ==================================================
+ // Outer loop closing
+ // ==================================================
+
+ add x27, x27, %x[src_stride_1]
+ add x28, x28, %x[dst_stride_1]
+ b outer_loop_start%=
+outer_loop_end%=:
+
+ .inst 0xd503467f // smstop
+ )"
+ :
+ : [src] "r"(src), [dst] "r"(dst), [shape_1] "r"(shape[1]), [src_stride_1] "r"(src_strides[1]),
+ [dst_stride_1] "r"(dst_strides[1]), [length] "r"(shape[0])
+ : "cc", "memory", //
+ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p9", //
+ "x9", "x10", "x11", "x12", "x13", "x14", "x15", //
+ "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", //
+ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", //
+ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", //
+ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" //
+ );
+}
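+
+// Editorial summary of the approximation implemented above (comments only, nothing executed;
+// the constants c1..c5, inv_ln2, neg_ln2_hi/lo and shift are the ones loaded earlier in this file):
+//   sigmoid(x) = 1 / (1 + e^-x)
+//   with -x = n*ln2 + r (n integral), e^-x is evaluated as
+//     scale  = 2^n                                         (n shifted into the float exponent bits)
+//     p12345 = c1*r + r^2*((c2 + c3*r) + r^2*(c4 + c5*r))  ~= e^r - 1
+//     poly   = scale + p12345 * scale                      ~= e^-x
+//   and the stored value is 1 / (1 + poly), with poly forced to 0 on underflow and +inf on overflow.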
+
+void sme2_fp32_logistic(const ITensor *in, ITensor *out, const ActivationLayerInfo &act_info, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ const auto *src_info = in->info();
+ const auto *dst_info = out->info();
+
+ const auto &src_strides = src_info->strides_in_bytes();
+ const auto &dst_strides = dst_info->strides_in_bytes();
+
+ // Iterator calculates pointer offsets and takes into account padding.
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // NOTE: This kernel uses collapsed 2D shapes.
+ // The execution window is expected to be pre-collapsed in the kernel's configure(...) function.
+ const uintptr_t k_shape[] = {window.num_iterations(0), window.num_iterations(1)};
+
+ const uintptr_t k_src_strides[] = {src_strides[0], src_strides[1]};
+ const uintptr_t k_dst_strides[] = {dst_strides[0], dst_strides[1]};
+
+ const auto *k_src = reinterpret_cast<const float *>(input.ptr());
+ auto *k_dst = reinterpret_cast<float *>(output.ptr());
+
+ sme2_f32_logistic_kernel(k_src, k_dst, k_shape, k_src_strides, k_dst_strides);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/cpu/kernels/logistic/list.h b/src/cpu/kernels/logistic/list.h
new file mode 100644
index 0000000000..7893a61248
--- /dev/null
+++ b/src/cpu/kernels/logistic/list.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_LOGISTIC_LIST_H
+#define ACL_SRC_CPU_KERNELS_LOGISTIC_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_LOGISTIC_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+
+#ifdef __aarch64__
+DECLARE_LOGISTIC_KERNEL(sme2_fp32_logistic);
+#endif // __aarch64__
+
+#undef DECLARE_LOGISTIC_KERNEL
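+
+// For reference (editorial note), DECLARE_LOGISTIC_KERNEL(sme2_fp32_logistic) above expands to:
+//   void sme2_fp32_logistic(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window);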
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_LOGISTIC_LIST_H
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
index 344b9df0c8..c73d1def6b 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
@@ -53,23 +53,24 @@ void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, fl
auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr());
auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
- float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f));
+ float32x4x2_t sum_vec = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)};
+
float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
float16x8_t data = vld1q_f16(in_ptr + x);
- sum_vec = vaddq_f16(sum_vec, data);
float32x4_t dl = vcvt_f32_f16(vget_low_f16(data));
float32x4_t dh = vcvt_f32_f16(vget_high_f16(data));
+ sum_vec.val[0] = vaddq_f32(sum_vec.val[0], dl);
+ sum_vec.val[1] = vaddq_f32(sum_vec.val[1], dh);
sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
}
- float32x4_t sum_carry_res =
- vpaddq_f32(vcvt_f32_f16(vget_high_f16(sum_vec)), vcvt_f32_f16(vget_low_f16(sum_vec)));
- float sum = vaddvq_f32(sum_carry_res);
- float sum_sq = vaddvq_f32(sum_sq_vec);
+ float32x4_t sum_carry_res = vpaddq_f32(sum_vec.val[0], sum_vec.val[1]);
+ float sum = vaddvq_f32(sum_carry_res);
+ float sum_sq = vaddvq_f32(sum_sq_vec);
// Compute left-over elements
for (; x < window_end_x; ++x)
diff --git a/src/cpu/kernels/mul/generic/sme2/list.h b/src/cpu/kernels/mul/generic/sme2/list.h
new file mode 100644
index 0000000000..f6aecfb91c
--- /dev/null
+++ b/src/cpu/kernels/mul/generic/sme2/list.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_MUL_GENERIC_SME2_LIST_H
+#define ACL_SRC_CPU_KERNELS_MUL_GENERIC_SME2_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_MUL_KERNEL(func_name) \
+ void func_name(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+
+DECLARE_MUL_KERNEL(sme2_q8_signed_mul);
+
+#undef DECLARE_MUL_KERNEL
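+
+// For reference (editorial note), DECLARE_MUL_KERNEL(sme2_q8_signed_mul) above expands to:
+//   void sme2_q8_signed_mul(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale);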
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_MUL_GENERIC_SME2_LIST_H
diff --git a/src/cpu/kernels/mul/generic/sme2/qasymm8_signed.cpp b/src/cpu/kernels/mul/generic/sme2/qasymm8_signed.cpp
new file mode 100644
index 0000000000..94fbaa1bb2
--- /dev/null
+++ b/src/cpu/kernels/mul/generic/sme2/qasymm8_signed.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+// Mul SME kernel
+void sme2_q8_signed_mul_kernel( //
+ const int8_t *src,
+ const int8_t *weights,
+ int8_t *dst,
+ const int16_t offset_a,
+ const int16_t offset_b,
+ const int16_t offset_c,
+ const float multiplier, // = (scale_a * scale_b * mul) / scale_c
+ const uintptr_t win_shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t wei_strides[4],
+ const uintptr_t dst_strides[4])
+{
+ struct Args
+ {
+ uintptr_t shape1;
+ uintptr_t shape2;
+ uintptr_t shape3;
+ const int8_t *src;
+ const int8_t *wei;
+ int8_t *dst;
+ int multiplier14p18;
+ int offsetC14p18;
+ int16_t offsetA;
+ int16_t offsetB;
+ } args;
+
+ // Constants used to express values in the 14p18 fixed point format
+ constexpr int32_t two_pwr18i = 262144;
+ constexpr float two_pwr18f = 262144.f;
+
+ args.shape1 = win_shape[1];
+ args.shape2 = win_shape[2];
+ args.shape3 = win_shape[3];
+ args.src = src;
+ args.wei = weights;
+ args.dst = dst;
+ args.multiplier14p18 = static_cast<int>(multiplier * two_pwr18f);
+ args.offsetC14p18 = static_cast<int>(offset_c * two_pwr18i);
+ // Offsets a/b need to be negated as assembly kernel uses addition instructions where subtraction is needed.
+ // Offset C is not negated as it needs to be added rather than subtracted.
+ args.offsetA = offset_a * -1;
+ args.offsetB = offset_b * -1;
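+
+ // Worked example (editorial, assuming multiplier == 0.25f and offset_c == 3):
+ //   multiplier14p18 = 0.25 * 262144 = 65536   (0.25 in 14.18 fixed point)
+ //   offsetC14p18    = 3 * 262144    = 786432  (3.0  in 14.18 fixed point)
+ // The assembly accumulates offsetC14p18 + product * multiplier14p18 and shifts right by 18 with
+ // rounding (sqrshr #18), recovering round(offset_c + product * multiplier) saturated to int8.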
+
+ // Precondition:
+ assert(src_strides[0] == sizeof(int8_t));
+ assert(wei_strides[0] == sizeof(int8_t));
+ assert(dst_strides[0] == sizeof(int8_t));
+ __asm__ volatile(
+ R"(
+ .inst 0xd503477f // smstart
+ .inst 0x25207811 // ptrue pn9.b
+ ptrue p0.b
+
+ // ==================================================
+ // 3D loop opening
+ // ==================================================
+
+ // ---------------------------------------------------------------- x8: body_length = (length / vl) * vl
+ cntb x8, ALL, MUL #2 // x8: vl = bytes per vector * 2 (elements processed per iteration)
+ udiv x9, %x[length], x8 // x9: length / vl
+ mul x8, x8, x9 // x8: body_length = vl * (length / vl)
+
+
+ // ---------------------------------------------------------------- Load outer loop count and data pointers from the args struct
+ ldr x10, [%[args_ptr], %[offset_shape_3]]
+ ldr x11, [%[args_ptr], %[offset_src_ptr]]
+ ldr x12, [%[args_ptr], %[offset_wei_ptr]]
+ ldr x13, [%[args_ptr], %[offset_dst_ptr]]
+
+ // Could potentially be replaced with explicit loads.
+ ld1rh {z1.h}, p0/z, [%[args_ptr], %[offset_A_offset]]
+ ld1rh {z2.h}, p0/z, [%[args_ptr], %[offset_B_offset]]
+ ld1rw {z3.s}, p0/z, [%[args_ptr], %[multiplier_offset]]
+
+loop_3_start%=:
+ // for index_3 in shape_3 downto 1
+ cmp x10, #0
+ b.eq loop_3_end%=
+ sub x10, x10, #1
+
+ ldr x14, [%[args_ptr], %[offset_shape_2]]
+ mov x15, x11
+ mov x16, x12
+ mov x17, x13
+
+loop_2_start%=:
+ // for index_2 in shape_2 downto 1
+ cmp x14, #0
+ b.eq loop_2_end%=
+ sub x14, x14, #1
+
+ ldr x7, [%[args_ptr], %[offset_shape_1]]
+ mov x20, x15
+ mov x21, x16
+ mov x22, x17
+
+loop_1_start%=:
+ // for index_1 in shape_1 downto 1
+ cmp x7, #0
+ b.eq loop_1_end%=
+ sub x7, x7, #1
+
+ mov x9, #0 // x9: index/count
+
+inner_loop_body_start%=:
+ cmp x9, x8
+ b.eq inner_loop_body_end%=
+
+ // Widening load: fetch a and b as bytes.
+
+ // NOTE: load 2 Z registers each (instead of 4) due to register pressure.
+ .inst 0xa0090684 // ld1b {z4.b-z5.b}, pn9/z, [x20, x9]
+ .inst 0xa00906a6 // ld1b {z6.b-z7.b}, pn9/z, [x21, x9]
+
+ // Widen to 16 bits
+ .inst 0xc175e08c // sunpk {z12.h-z15.h}, {z4.b-z5.b} // (a)
+ .inst 0xc175e0d0 // sunpk {z16.h-z19.h}, {z6.b-z7.b} // (b)
+
+ // Apply offset to all registers in 16-bit
+ .inst 0xc161ab0c // add {z12.h-z15.h}, {z12.h-z15.h}, z1.h //a
+ .inst 0xc162ab10 // add {z16.h-z19.h}, {z16.h-z19.h}, z2.h //b
+
+ // Widen to 32-bit now.
+ // 12-19 are taken
+ // 4-11 a, 20-27 b
+ .inst 0xc1b5e184 // sunpk {z4.s-z7.s}, {z12.h-z13.h} //a
+ .inst 0xc1b5e1c8 // sunpk {z8.s-z11.s}, {z14.h-z15.h}
+ .inst 0xc1b5e214 // sunpk {z20.s-z23.s}, {z16.h-z17.h} //b
+ .inst 0xc1b5e258 // sunpk {z24.s-z27.s}, {z18.h-z19.h}
+
+ // Multiply a*b in int32
+ // Output in z4-z11
+ MUL z4.s, z4.s, z20.s
+ MUL z5.s, z5.s, z21.s
+ MUL z6.s, z6.s, z22.s
+ MUL z7.s, z7.s, z23.s
+ MUL z8.s, z8.s, z24.s
+ MUL z9.s, z9.s, z25.s
+ MUL z10.s, z10.s, z26.s
+ MUL z11.s, z11.s, z27.s
+
+ // Initialise accumulators with offset_c (14.18 fixed point)
+ dup z12.s, %w[offset_C]
+ dup z13.s, %w[offset_C]
+ dup z14.s, %w[offset_C]
+ dup z15.s, %w[offset_C]
+ dup z16.s, %w[offset_C]
+ dup z17.s, %w[offset_C]
+ dup z18.s, %w[offset_C]
+ dup z19.s, %w[offset_C]
+
+ // MLA: accumulate product * multiplier in 14.18 fixed point
+ MLA z12.s, p0/m, z4.s, z3.s
+ MLA z13.s, p0/m, z5.s, z3.s
+ MLA z14.s, p0/m, z6.s, z3.s
+ MLA z15.s, p0/m, z7.s, z3.s
+ MLA z16.s, p0/m, z8.s, z3.s
+ MLA z17.s, p0/m, z9.s, z3.s
+ MLA z18.s, p0/m, z10.s, z3.s
+ MLA z19.s, p0/m, z11.s, z3.s
+
+ // Int32 to Int8 saturate
+ .inst 0xc16eda05 // sqrshr z5.b, {z16.s-z19.s}, #18
+ .inst 0xc16ed984 // sqrshr z4.b, {z12.s-z15.s}, #18
+ // Store
+ .inst 0xa02906c4 // st1b {z4.b-z5.b}, pn9, [x22, x9]
+
+ incb x9, ALL, MUL #2
+ b inner_loop_body_start%=
+inner_loop_body_end%=:
+
+inner_loop_leftover_start%=:
+ whilelo p1.b, x9, %x[length] // While x9<length
+ b.none inner_loop_leftover_end%=
+
+ // Handle the leftover elements: same multiplication, with predicated loads and stores
+ ld1b z4.b, p1/z, [x20, x9] // z4: a input_data
+ ld1b z5.b, p1/z, [x21, x9] // z5: b input_data
+
+ // Widen register z4 (a)
+ sunpklo z6.h, z4.b // lower as 16 bits
+ sunpkhi z7.h, z4.b // upper as 16 bits
+
+ // Widen register z5 (b)
+ sunpklo z8.h, z5.b // lower as 16 bits
+ sunpkhi z9.h, z5.b // upper as 16 bits
+
+ // Apply offset in 16bit maths to all resulting vectors.
+ add z6.h, z6.h, z1.h //a
+ add z7.h, z7.h, z1.h
+ add z8.h, z8.h, z2.h //b
+ add z9.h, z9.h, z2.h
+
+ // Widen a,b to 32-bit z-registers.
+ // Multiply a and b and store result as 32 bit int.
+ // a lower - 32-bit
+ sunpklo z10.s, z6.h
+ sunpkhi z11.s, z6.h
+ // a upper - 32-bit
+ sunpklo z12.s, z7.h
+ sunpkhi z13.s, z7.h
+
+ // b lower - 32-bit
+ sunpklo z14.s, z8.h
+ sunpkhi z15.s, z8.h
+ // b upper - 32-bit
+ sunpklo z16.s, z9.h
+ sunpkhi z17.s, z9.h
+
+ // Initialise accumulators with offset_c (14.18 fixed point)
+ dup z4.s, %w[offset_C]
+ dup z5.s, %w[offset_C]
+ dup z6.s, %w[offset_C]
+ dup z7.s, %w[offset_C]
+
+ // Multiply a*b (lower) in int32
+ MUL z10.s, z10.s, z14.s
+ MUL z11.s, z11.s, z15.s
+
+ // Multiply a*b (upper) in int32
+ MUL z12.s, z12.s, z16.s
+ MUL z13.s, z13.s, z17.s
+
+ // Still int32 here.
+ // Now MLA in fixed point
+ MLA z4.s, p0/m, z10.s, z3.s
+ MLA z5.s, p0/m, z11.s, z3.s
+ MLA z6.s, p0/m, z12.s, z3.s
+ MLA z7.s, p0/m, z13.s, z3.s
+
+ // Right shift, no narrow
+ LSR z20.s, z4.s, #8
+ LSR z21.s, z5.s, #8
+ LSR z22.s, z6.s, #8
+ LSR z23.s, z7.s, #8
+
+ // Right shift rounding (lower)
+ // Do not saturate.
+ RSHRNB z20.h, z20.s, #8
+ RSHRNB z21.h, z21.s, #8
+ UZP1 z25.h, z20.h, z21.h
+ // Right shift upper.
+ RSHRNB z22.h, z22.s, #8
+ RSHRNB z23.h, z23.s, #8
+ UZP1 z26.h, z22.h, z23.h
+
+ // Shift down to 8 bits for both vectors, then recombine.
+ SQRSHRNB z25.b, z25.h, #2
+ SQRSHRNB z26.b, z26.h, #2
+ UZP1 z27.b, z25.b, z26.b
+
+ st1b z27.b, p1, [x22, x9]
+
+ incb x9 // x9 += number of bytes per vector
+ b inner_loop_leftover_start%=
+inner_loop_leftover_end%=:
+
+ // ==================================================
+ // 3D loop closing
+ // ==================================================
+
+ add x20, x20, %[src_stride_1]
+ add x21, x21, %[wei_stride_1]
+ add x22, x22, %[dst_stride_1]
+ b loop_1_start%=
+loop_1_end%=:
+
+ add x15, x15, %[src_stride_2]
+ add x16, x16, %[wei_stride_2]
+ add x17, x17, %[dst_stride_2]
+ b loop_2_start%=
+loop_2_end%=:
+
+ add x11, x11, %[src_stride_3]
+ add x12, x12, %[wei_stride_3]
+ add x13, x13, %[dst_stride_3]
+ b loop_3_start%=
+loop_3_end%=:
+
+ .inst 0xd503467f // smstop
+ )"
+ :
+ : // The following arguments are loaded via arg ptr values and a constant offset.
+ [args_ptr] "r"(&args), [offset_src_ptr] "I"(offsetof(Args, src)), [offset_wei_ptr] "I"(offsetof(Args, wei)),
+ [offset_dst_ptr] "I"(offsetof(Args, dst)), [offset_shape_1] "I"(offsetof(Args, shape1)),
+ [offset_shape_2] "I"(offsetof(Args, shape2)), [offset_shape_3] "I"(offsetof(Args, shape3)),
+ [multiplier_offset] "I"(offsetof(Args, multiplier14p18)), //
+ [offset_A_offset] "I"(offsetof(Args, offsetA)), //
+ [offset_B_offset] "I"(offsetof(Args, offsetB)), //
+ // Use registers for efficiency sake.
+ [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]), [src_stride_3] "r"(src_strides[3]),
+ [wei_stride_1] "r"(wei_strides[1]), [wei_stride_2] "r"(wei_strides[2]), [wei_stride_3] "r"(wei_strides[3]),
+ [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]), [dst_stride_3] "r"(dst_strides[3]),
+ [offset_C] "r"(args.offsetC14p18), //
+ [length] "r"(win_shape[0])
+ : "cc", "memory", //
+ "p0", "p1", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22",
+ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", //
+ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", //
+ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", //
+ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" //
+ );
+}
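+
+// Editorial sketch (hypothetical helper, not called by the kernel above): a scalar reference for one
+// output element using the same 14.18 fixed-point encoding. The kernel stays in 32-bit accumulators;
+// 64-bit arithmetic is used here only for clarity, so this is a reference, not a bit-exact model.
+inline int8_t sme2_q8_signed_mul_scalar_reference(
+    int8_t a, int8_t b, int16_t offset_a, int16_t offset_b, int16_t offset_c, float multiplier)
+{
+    const int32_t prod = (static_cast<int32_t>(a) - offset_a) * (static_cast<int32_t>(b) - offset_b);
+    const int32_t multiplier_14p18 = static_cast<int32_t>(multiplier * 262144.f); // multiplier in 14.18
+    const int64_t acc_14p18 = static_cast<int64_t>(offset_c) * 262144             // offset_c in 14.18
+                            + static_cast<int64_t>(prod) * multiplier_14p18;      // + product * multiplier
+    int64_t rounded = (acc_14p18 + (1 << 17)) >> 18;                               // rounding shift (cf. sqrshr #18)
+    rounded = rounded < -128 ? -128 : (rounded > 127 ? 127 : rounded);             // saturate to the int8 range
+    return static_cast<int8_t>(rounded);
+}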
+
+void sme2_q8_signed_mul(const ITensor *in0, const ITensor *in1, ITensor *out, const Window &window, const float scale)
+{
+ const auto *src_info = in0->info();
+ const auto *src2_info = in1->info();
+ const auto *dst_info = out->info();
+
+ const UniformQuantizationInfo src_q_info = src_info->quantization_info().uniform();
+ const UniformQuantizationInfo src2_q_info = src2_info->quantization_info().uniform();
+ const UniformQuantizationInfo dst_q_info = dst_info->quantization_info().uniform();
+
+ const auto &src_strides_bytes = src_info->strides_in_bytes();
+ const auto &wei_strides_bytes = src2_info->strides_in_bytes();
+ const auto &dst_strides_bytes = dst_info->strides_in_bytes();
+
+ // NOTE: This kernel does not support shapes above 4D (unless the execution window has been collapsed).
+ assert(window.num_iterations(4) == 1 && window.num_iterations(5) == 1);
+
+ // Note: broadcasting along dims 1-3 is handled by setting the corresponding strides to 0 below.
+ const uintptr_t shape[] = {
+ window.num_iterations(0),
+ window.num_iterations(1),
+ window.num_iterations(2),
+ window.num_iterations(3),
+ };
+
+ Window input1_win = window.broadcast_if_dimension_le_one(src_info->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2_info->tensor_shape());
+
+ // The stride of dim 0 is always the element size. If broadcasting in other dims, set the stride to 0.
+ uintptr_t src_strides[] = {src_strides_bytes[0], (input1_win.is_broadcasted(1)) ? 0 : src_strides_bytes[1],
+ (input1_win.is_broadcasted(2)) ? 0 : src_strides_bytes[2],
+ (input1_win.is_broadcasted(3)) ? 0 : src_strides_bytes[3]};
+ uintptr_t wei_strides[] = {wei_strides_bytes[0], (input2_win.is_broadcasted(1)) ? 0 : wei_strides_bytes[1],
+ (input2_win.is_broadcasted(2)) ? 0 : wei_strides_bytes[2],
+ (input2_win.is_broadcasted(3)) ? 0 : wei_strides_bytes[3]};
+
+ const uintptr_t dst_strides[] = {
+ dst_strides_bytes[0],
+ dst_strides_bytes[1],
+ dst_strides_bytes[2],
+ dst_strides_bytes[3],
+ };
+
+ const uintptr_t src_offset = window[0].start() * src_strides[0] + window[1].start() * src_strides[1] +
+ window[2].start() * src_strides[2] + window[3].start() * src_strides[3] +
+ in0->info()->offset_first_element_in_bytes();
+ const uintptr_t src2_offset = window[0].start() * wei_strides[0] + window[1].start() * wei_strides[1] +
+ window[2].start() * wei_strides[2] + window[3].start() * wei_strides[3] +
+ in1->info()->offset_first_element_in_bytes();
+ const uintptr_t dst_offset = window[0].start() * dst_strides[0] + window[1].start() * dst_strides[1] +
+ window[2].start() * dst_strides[2] + window[3].start() * dst_strides[3] +
+ out->info()->offset_first_element_in_bytes();
+
+ const auto *src = reinterpret_cast<const int8_t *>(in0->buffer() + src_offset);
+ const auto *src2 = reinterpret_cast<const int8_t *>(in1->buffer() + src2_offset);
+ auto *dst = reinterpret_cast<int8_t *>(out->buffer() + dst_offset);
+
+ // Calculate or retrieve necessary offsets/scale values.
+ const int16_t offset_a = src_q_info.offset;
+ const int16_t offset_b = src2_q_info.offset;
+ float multiplier = (src_q_info.scale * src2_q_info.scale * scale) / dst_q_info.scale;
+
+ sme2_q8_signed_mul_kernel(src, src2, dst, offset_a, offset_b, dst_q_info.offset, multiplier, shape, src_strides,
+ wei_strides, dst_strides);
+}
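+
+// Worked example (editorial): with a src scale of 0.5, a src2 scale of 0.25, a requested scale of 2.0
+// and a dst scale of 0.1, the combined multiplier handed to the kernel is (0.5 * 0.25 * 2.0) / 0.1 = 2.5,
+// which sme2_q8_signed_mul_kernel then encodes as 2.5 * 262144 = 655360 in 14.18 fixed point.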
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SME2