14 files changed, 248 insertions, 511 deletions
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index 2e2adf33e0..db8f881712 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -31,21 +31,18 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp16_softmax(const ITensor *in,
-                       const ITensor *max,
-                       void *const    tmp,
-                       ITensor       *out,
-                       const float    beta,
-                       bool           is_log,
-                       const Window  &window)
-{
-    return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
 
-void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_logits_1d_max<float16_t>(in, out, window);
+    return neon_softmax_float<float16_t, IS_LOG>(in, tmp, out, beta, window);
 }
+
+template void
+neon_fp16_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp16_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
 #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index 61df40c1b5..c281d1bf31 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,17 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp32_softmax(const ITensor *in,
-                       const ITensor *max,
-                       void *const    tmp,
-                       ITensor       *out,
-                       const float    beta,
-                       bool           is_log,
-                       const Window  &window)
-{
-    return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
 
-void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_logits_1d_max<float>(in, out, window);
+    return neon_softmax_float<float, IS_LOG>(in, tmp, out, beta, window);
 }
+
+template void
+neon_fp32_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp32_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index 5d6e6a4f80..487f6ae051 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -29,43 +29,76 @@ namespace arm_compute
 {
 namespace cpu
 {
-template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template <typename T>
-void neon_softmax_logits_1d_quantized(
-    const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
 {
     static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
                   "quantized type should be either qasymm8_t or qasymm8_signed_t.");
 
-    const int start_x     = in->info()->valid_region().anchor.x();
     const int input_width = in->info()->valid_region().shape.x();
 
-    const float scale_beta     = -beta * in->info()->quantization_info().uniform().scale;
-    const auto  scale_beta_vec = vdupq_n_f32(scale_beta);
+    const float       scale_beta     = -beta * in->info()->quantization_info().uniform().scale;
+    const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta);
+
+    Iterator in_it(in, window);
+    Iterator out_it(out, window);
 
-    Iterator      in_it(in, window);
-    Iterator      max_it(max, window);
-    Iterator      out_it(out, window);
     constexpr int vec_size = 16;
 
+#ifndef __aarch64__
+    const int sum_stages = log2(vec_size >> 1);
+#endif // __aarch64__
+
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
     execute_window_loop(
         window,
         [&](const Coordinates &)
         {
             /* Get pointers */
-            const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
-            const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
-            const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+            const T *in_ptr  = reinterpret_cast<const T *>(in_it.ptr());
+            T       *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+            float   *tmp_ptr = reinterpret_cast<float *>(tmp);
+
+            T max_val;
+
+            /* Compute Max */
+            {
+                // Init max value
+                auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+                int  x       = 0;
 
-            float sum{};
-            float sum_inversed{};
+                for (; x <= (input_width - vec_size); x += vec_size)
+                {
+                    const auto current_value = wrapper::vloadq(in_ptr + x);
+                    vec_max                  = wrapper::vmax(vec_max, current_value);
+                }
+
+#ifdef __aarch64__
+                max_val = wrapper::vmaxv(vec_max);
+#else  // __aarch64__
+                auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+                for (int i = 0; i < sum_stages; ++i)
+                {
+                    carry_max = wrapper::vpmax(carry_max, carry_max);
+                }
+
+                max_val      = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
+
+                // Compute left-over elements
+                for (; x < input_width; ++x)
+                {
+                    max_val = std::max(*(in_ptr + x), max_val);
+                }
+            } // Compute Max
+
+            float sum_transformed{};
 
             /* Compute exponentials and sum */
             {
                 /* Get max value */
-                const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
                 const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
 
                 /* Init sum to zero */
@@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized(
                 int x = 0;
                 for (; x <= (input_width - vec_size); x += vec_size)
                 {
-                    auto vec_elements     = wrapper::vloadq(in_ptr + x);
-                    vec_elements          = wrapper::vqsub(vec_max, vec_elements);
-                    auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+                    auto vec_elements              = wrapper::vloadq(in_ptr + x);
+                    vec_elements                   = wrapper::vqsub(vec_max, vec_elements);
+                    float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
 
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
                         vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
@@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized(
                 }
 
                 /* Reduce sum */
-                const auto sum_16_byte =
+                const float32x4_t sum_16_byte =
                     vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+
+                float sum;
+
+#ifdef __aarch64__
+                sum = wrapper::vaddv(sum_16_byte);
+#else  // __aarch64__
                 auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
                 sum_res      = vpadd_f32(sum_res, sum_res);
                 sum          = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
 
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
                     float element{};
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         element = (max_val - in_ptr[x]) * scale_beta;
                         sum += std::exp(element);
@@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized(
                     tmp_ptr[x] = element;
                 }
 
-                if (!is_log)
+                if (!IS_LOG)
                 {
-                    sum_inversed = 256.f / sum;
+                    sum_transformed = 256.f / sum;
                 }
                 else
                 {
-                    sum = std::log(sum);
+                    sum_transformed = std::log(sum);
                 }
-            }
+            } // Compute exponentials and sum
 
             /* Normalize exponentials */
             {
                 constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+
+                const float32x4_t sum_vec = vdupq_n_f32(sum_transformed);
+
                 /* Loop over row and compute softmax */
                 int x = 0;
                 for (; x <= (input_width - vec_size); x += vec_size)
@@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized(
                     using int_vec_type   = wrapper::traits::neon_vector_t<T, 16>;
                     float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
                     int_vec_type  normalized_value{};
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         const float32x4x4_t sub = {
-                            vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
-                            vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
-                            vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
-                            vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+                            vsubq_f32(vec_in.val[0], sum_vec),
+                            vsubq_f32(vec_in.val[1], sum_vec),
+                            vsubq_f32(vec_in.val[2], sum_vec),
+                            vsubq_f32(vec_in.val[3], sum_vec),
                         };
                         normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
                     }
                     else
                     {
                         float32x4x4_t mul = {
-                            vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
-                            vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
-                            vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
-                            vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+                            vmulq_f32(vec_in.val[0], sum_vec),
+                            vmulq_f32(vec_in.val[1], sum_vec),
+                            vmulq_f32(vec_in.val[2], sum_vec),
+                            vmulq_f32(vec_in.val[3], sum_vec),
                         };
 
                         if (is_qasymm8_signed)
@@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized(
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
-                    if (is_log)
+                    if (IS_LOG)
                     {
-                        out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+                        out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed);
                     }
                     else
                     {
-                        out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+                        out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) -
                                                                    (is_qasymm8_signed ? 128.f : 0));
                     }
                 }
-            }
+            } // Normalize exponentials
         },
-        in_it, max_it, out_it);
+        in_it, out_it);
 }
 
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
-                                                                 const ITensor *max,
-                                                                 void *const    tmp,
-                                                                 ITensor       *out,
-                                                                 float          beta,
-                                                                 bool           is_log,
-                                                                 const Window  &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
-                                                          const ITensor *max,
-                                                          void *const    tmp,
-                                                          ITensor       *out,
-                                                          float          beta,
-                                                          bool           is_log,
-                                                          const Window  &window);
+template void neon_softmax_quantized<qasymm8_signed_t, true>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_signed_t, false>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, true>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, false>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 4d9b789297..60380cd233 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
 
@@ -33,105 +33,100 @@ namespace arm_compute
 {
 namespace cpu
 {
-template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
-    /** SIMD vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
-    constexpr int window_step_x  = 16 / sizeof(T);
-    const auto    window_start_x = static_cast<int>(window.x().start());
-    const auto    window_end_x   = static_cast<int>(window.x().end());
-
-    Window win{window};
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    Iterator input(in, win);
-    Iterator output(out, win);
-
-    const int sum_stages = log2(window_step_x / 2);
-    execute_window_loop(
-        win,
-        [&](const Coordinates &)
-        {
-            // Get pointers
-            const auto in_ptr  = reinterpret_cast<const T *>(input.ptr());
-            const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
-            // Init max value
-            auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
-            int  x       = window_start_x;
-
-            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto current_value = wrapper::vloadq(in_ptr + x);
-                vec_max                  = wrapper::vmax(vec_max, current_value);
-            }
-            auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
-            for (int i = 0; i < sum_stages; ++i)
-            {
-                carry_max = wrapper::vpmax(carry_max, carry_max);
-            }
-            T max_val = wrapper::vgetlane(carry_max, 0);
 
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-                max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
-            }
+#ifdef __aarch64__
+namespace
+{
+// These helper functions are added because vaddv does not exist for fp16,
+// and, therefore, is not part of the wrapper::vaddv interface.
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages)
+{
+    auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a));
+    for (int i = 0; i < sum_stages; ++i)
+    {
+        sum_res = wrapper::vpadd(sum_res, sum_res);
+    }
+    return wrapper::vgetlane(sum_res, 0);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-            *out_ptr = max_val;
-        },
-        input, output);
+inline float wrapper_vaddv(const float32x4_t &a, int sum_stages)
+{
+    ARM_COMPUTE_UNUSED(sum_stages);
+    return wrapper::vaddv(a);
 }
+} // namespace
+#endif // __aarch64__
 
-template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in,
-                                      const ITensor *max,
-                                      void *const    tmp,
-                                      ITensor       *out,
-                                      float          beta,
-                                      bool           is_log,
-                                      const Window  &window);
-
-template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in,
-                                  const ITensor *max,
-                                  void *const    tmp,
-                                  ITensor       *out,
-                                  const float    beta,
-                                  bool           is_log,
-                                  const Window  &window)
+// The template implementation for float data types is stored in the header file because
+// we need all fp16 instantiated code to live in fp16.cpp files.
+template <typename T, bool IS_LOG>
+void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
 {
-    const int start_x     = in->info()->valid_region().anchor.x();
+    ARM_COMPUTE_UNUSED(tmp);
+
     const int input_width = in->info()->valid_region().shape.x();
 
     Iterator in_it(in, window);
-    Iterator max_it(max, window);
     Iterator out_it(out, window);
 
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
 
-    constexpr int vec_size   = 16 / sizeof(T);
-    const int     sum_stages = log2(vec_size / 2);
+    constexpr int vec_size = 16 / sizeof(T);
+
+    const int sum_stages = log2(vec_size >> 1);
+
+    const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
 
     execute_window_loop(
         window,
         [&](const Coordinates &)
         {
             /* Get pointers */
-            const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
-            const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
-            const auto tmp_ptr = reinterpret_cast<T *>(tmp);
+            const T *in_ptr  = reinterpret_cast<const T *>(in_it.ptr());
+            T       *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+
+            T max_val;
+
+            /* Compute Max */
+            {
+                // Init max value
+                auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+                int  x       = 0;
+
+                for (; x <= (input_width - vec_size); x += vec_size)
+                {
+                    const auto current_value = wrapper::vloadq(in_ptr + x);
+                    vec_max                  = wrapper::vmax(vec_max, current_value);
+                }
+
+#ifdef __aarch64__
+                max_val = wrapper::vmaxv(vec_max);
+#else  // __aarch64__
+                auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+                for (int i = 0; i < sum_stages; ++i)
+                {
+                    carry_max = wrapper::vpmax(carry_max, carry_max);
+                }
+
+                max_val      = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
 
-            T sum{};
-            T sum_inversed{};
+                // Compute left-over elements
+                for (; x < input_width; ++x)
+                {
+                    max_val = std::max(*(in_ptr + x), max_val);
+                }
+            } // compute max
+
+            T sum_transformed{};
 
             /* Compute exponentials and sum */
             {
                 /* Get max value */
-                const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
                 const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
 
                 /* Init sum to zero */
@@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in,
                 {
                     auto vec_elements = wrapper::vloadq(in_ptr + x);
                     vec_elements      = wrapper::vsub(vec_elements, vec_max);
-                    if (is_log)
+                    if (IS_LOG)
                     {
-                        vec_elements =
-                            wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
-                        vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+                        vec_elements = wrapper::vmul(vec_elements, beta_vec);
+                        vec_sum      = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
                     }
                     else
                     {
-                        vec_elements = wrapper::vexpq(
-                            wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
-                        vec_sum = wrapper::vadd(vec_sum, vec_elements);
+                        vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
+                        vec_sum      = wrapper::vadd(vec_sum, vec_elements);
                     }
-                    wrapper::vstore(tmp_ptr + x, vec_elements);
+                    wrapper::vstore(out_ptr + x, vec_elements);
                 }
 
                 /* Reduce sum */
+                T sum{};
+#ifdef __aarch64__
+                sum = wrapper_vaddv(vec_sum, sum_stages);
+#else  // __aarch64__
                 auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
                 for (int i = 0; i < sum_stages; ++i)
                 {
                     sum_res = wrapper::vpadd(sum_res, sum_res);
                 }
                 sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
 
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
                     T element{};
 
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         element = (in_ptr[x] - max_val) * beta;
                         sum += std::exp(element);
@@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in,
                         element = std::exp((in_ptr[x] - max_val) * beta);
                         sum += element;
                     }
-                    tmp_ptr[x] = element;
+
+                    out_ptr[x] = element;
                 }
 
-                if (!is_log)
+                if (!IS_LOG)
                 {
-                    sum_inversed = T(1) / sum;
+                    sum_transformed = T(1) / sum;
                 }
                 else
                 {
-                    sum = static_cast<T>(std::log(sum));
+                    sum_transformed = static_cast<T>(std::log(sum));
                 }
-            }
+            } // Compute exponentials and sum
 
             /* Normalize exponentials */
             {
+                const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{});
+
                 /* Loop over row and compute softmax */
                 int x = 0;
                 for (; x <= (input_width - vec_size); x += vec_size)
                 {
-                    auto vec_in           = wrapper::vloadq(tmp_ptr + x);
-                    auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
-                    if (is_log)
+                    const auto vec_in = wrapper::vloadq(out_ptr + x);
+                    if (IS_LOG)
                     {
-                        normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+                        wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec));
                     }
                     else
                     {
-                        normalized_value =
-                            wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+                        wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec));
                     }
-                    wrapper::vstore(out_ptr + x, normalized_value);
                 }
+
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
-                    if (is_log)
+                    if (IS_LOG)
                     {
-                        out_ptr[x] = tmp_ptr[x] - sum;
+                        out_ptr[x] = out_ptr[x] - sum_transformed;
                     }
                     else
                     {
-                        out_ptr[x] = tmp_ptr[x] * sum_inversed;
+                        out_ptr[x] = out_ptr[x] * sum_transformed;
                     }
                 }
-            }
+            } // Normalize exponentials
         },
-        in_it, max_it, out_it);
+        in_it, out_it);
 }
+
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index 40713dc496..9589ebcd7c 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,16 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_qasymm8_softmax(const ITensor *in,
-                          const ITensor *max,
-                          void *const    tmp,
-                          ITensor       *out,
-                          const float    beta,
-                          bool           is_log,
-                          const Window  &window)
+template <bool IS_LOG>
+void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
+    return neon_softmax_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, window);
 }
 
-void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return neon_logits_1d_max<qasymm8_t>(in, out, window);
-}
+template void
+neon_qasymm8_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_qasymm8_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 2c5e284f54..0bf6b2859a 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,17 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_qasymm8_signed_softmax(const ITensor *in,
-                                 const ITensor *max,
-                                 void *const    tmp,
-                                 ITensor       *out,
-                                 const float    beta,
-                                 bool           is_log,
-                                 const Window  &window)
+template <bool IS_LOG>
+void neon_qasymm8_signed_softmax(
+    const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
+    return neon_softmax_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, window);
 }
 
-void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
+template void neon_qasymm8_signed_softmax<true>(
+    const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void neon_qasymm8_signed_softmax<false>(
+    const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
deleted file mode 100644
index 5e94f72faf..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2021-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/CpuTypes.h"
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp16_softmax(const ITensor *in,
-                      const ITensor *max,
-                      void *const    tmp,
-                      ITensor       *out,
-                      const float    beta,
-                      bool           is_log,
-                      const Window  &window)
-{
-    return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<float16_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
deleted file mode 100644
index d692cc2477..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp32_softmax(const ITensor *in,
-                      const ITensor *max,
-                      void *const    tmp,
-                      ITensor       *out,
-                      const float    beta,
-                      bool           is_log,
-                      const Window  &window)
-{
-    return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<float>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 24f1bb8143..0d4b7f4509 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,9 @@ namespace arm_compute
 {
 namespace cpu
 {
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to
+/// a single kernel that performs softmax operation. Leaving the SVE code here for
+/// future references. Implementation for Neon(TM) is introduced in COMPMID-6500
 template <typename ScalarType>
 void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
 {
@@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in,
         },
         in_it, max_it, out_it);
 }
-
-template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template void sve_softmax_logits_1d_float<float>(const ITensor *in,
-                                                 const ITensor *max,
-                                                 void *const    tmp,
-                                                 ITensor       *out,
-                                                 const float    beta,
-                                                 bool           is_log,
-                                                 const Window  &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
-                                                     const ITensor *max,
-                                                     void *const    tmp,
-                                                     ITensor       *out,
-                                                     const float    beta,
-                                                     bool           is_log,
-                                                     const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
deleted file mode 100644
index 85e5ccfea1..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<qasymm8_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
deleted file mode 100644
index 4be2e2eed6..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 98b2f5117f..a8fb1d4adf 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,9 @@ namespace arm_compute
 {
 namespace cpu
 {
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to
+/// a single kernel that performs softmax operation. Leaving the SVE2 code here for
+/// future references. Implementation for Neon(TM) is introduced in COMPMID-6500
 template <typename ScalarType>
 void sve2_softmax_logits_1d_quantized(
     const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
@@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized(
         },
         in_it, max_it, out_it);
 }
-
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
-                                                                 const ITensor *max,
-                                                                 void *const    tmp,
-                                                                 ITensor       *out,
-                                                                 float          beta,
-                                                                 bool           is_log,
-                                                                 const Window  &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
-                                                          const ITensor *max,
-                                                          void *const    tmp,
-                                                          ITensor       *out,
-                                                          float          beta,
-                                                          bool           is_log,
-                                                          const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
deleted file mode 100644
index 95623786b3..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_softmax(const ITensor *in,
-                          const ITensor *max,
-                          void *const    tmp,
-                          ITensor       *out,
-                          const float    beta,
-                          bool           is_log,
-                          const Window  &window)
-{
-    return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
deleted file mode 100644
index c20462fcef..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_signed_softmax(const ITensor *in,
-                                 const ITensor *max,
-                                 void *const    tmp,
-                                 ITensor       *out,
-                                 const float    beta,
-                                 bool           is_log,
-                                 const Window  &window)
-{
-    return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute