diff options
27 files changed, 631 insertions, 1061 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 13f4e9ea2a..ac4f0610ea 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -46,6 +46,8 @@ v24.01 Public major release You should link only to the main `libarm_compute` library for core functionality. - New features - Add support for FP16 in all multi_isa builds. + - Performance optimizations: + - Optimize @ref NESoftmaxLayer v23.11 Public major release - New features @@ -438,8 +440,8 @@ v21.02 Public major release - @ref NEActivationLayer - @ref NEArithmeticAddition - @ref NEBatchNormalizationLayerKernel - - @ref cpu::kernels::CpuLogits1DSoftmaxKernel - - @ref cpu::kernels::CpuLogits1DMaxKernel + - cpu::kernels::CpuLogits1DSoftmaxKernel + - cpu::kernels::CpuLogits1DMaxKernel - @ref cpu::kernels::CpuElementwiseUnaryKernel - Remove padding from OpenCL kernels: - CLDirectConvolutionLayerKernel diff --git a/filelist.json b/filelist.json index c34eff2ff9..60f4285a03 100644 --- a/filelist.json +++ b/filelist.json @@ -2213,16 +2213,10 @@ "qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"] }, "sve": { - "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ], - "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"], - "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"], - "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ], - "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ] }, "sve2":{ - "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"], - "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"], - "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"] } } }, diff --git a/src/BUILD.bazel b/src/BUILD.bazel index f281b6a4d5..c14e10c836 100644 --- 
a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -117,9 +117,7 @@ filegroup( "cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/elementwise_unary/generic/sve2/q8.cpp", "cpu/kernels/lut/generic/sve2/u8.cpp", - "cpu/kernels/softmax/generic/sve2/impl.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve2/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -342,11 +340,7 @@ filegroup( "cpu/kernels/scale/sve/integer.cpp", "cpu/kernels/scale/sve/qasymm8.cpp", "cpu/kernels/scale/sve/qasymm8_signed.cpp", - "cpu/kernels/softmax/generic/sve/fp16.cpp", - "cpu/kernels/softmax/generic/sve/fp32.cpp", - "cpu/kernels/softmax/generic/sve/impl.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3229ffa8c2..e6c6782da1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -317,11 +317,7 @@ target_sources( cpu/kernels/scale/sve/integer.cpp cpu/kernels/scale/sve/qasymm8.cpp cpu/kernels/scale/sve/qasymm8_signed.cpp - cpu/kernels/softmax/generic/sve/fp16.cpp - cpu/kernels/softmax/generic/sve/fp32.cpp cpu/kernels/softmax/generic/sve/impl.cpp - cpu/kernels/softmax/generic/sve/qasymm8.cpp - cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp ) target_sources( @@ -339,8 +335,6 @@ target_sources( cpu/kernels/elementwise_unary/generic/sve2/q8.cpp cpu/kernels/lut/generic/sve2/u8.cpp cpu/kernels/softmax/generic/sve2/impl.cpp - cpu/kernels/softmax/generic/sve2/qasymm8.cpp - cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp ) target_sources( diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h index cec437d171..32d38a856c 100644 --- a/src/core/NEON/wrapper/intrinsics/max.h +++ b/src/core/NEON/wrapper/intrinsics/max.h @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_WRAPPER_MAX_H -#define ARM_COMPUTE_WRAPPER_MAX_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H #include <arm_neon.h> @@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VMAX_IMPL + +#if defined(__aarch64__) +// VMAXV: Across vector max +#define VMAXV_IMPL(stype, vtype, prefix, postfix) \ + inline stype vmaxv(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8) +VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8) +VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16) +VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16) +VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32) +VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32) +VMAXV_IMPL(float, float32x2_t, vmaxv, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8) +VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8) +VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16) +VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16) +VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32) +VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32) +VMAXV_IMPL(float, float32x4_t, vmaxvq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VMAXV_IMPL +#endif // defined(__aarch64__) } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MAX_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 
b7daa4d583..45ebeec394 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H -#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H +#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H #include "arm_compute/core/Types.h" @@ -99,6 +99,13 @@ struct ScaleKernelDataTypeISASelectorData InterpolationPolicy interpolation_policy; }; +struct SoftmaxKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + bool is_log; +}; + // Selector pointer types using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type; using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type; @@ -113,9 +120,10 @@ using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type; using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type; - +using SoftmaxKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index ce144351f8..486f55e2c1 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -34,9 +34,12 @@ #include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/softmax/list.h" 
+#include <vector> + namespace arm_compute { namespace cpu @@ -45,136 +48,40 @@ namespace kernels { namespace { -/* Softmax Logits 1D Max - identifying the max value of 1D Logits */ -static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = { - {"sve_fp32_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_logits)}, - {"sve_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_logits)}, - {"sve_qu8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, - REGISTER_QASYMM8_SVE(sve_qasymm8_logits)}, - {"sve_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, - REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)}, - {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_logits)}, - {"neon_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_logits)}, - {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(neon_qasymm8_logits)}, - {"neon_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)}, +/* Softmax */ +static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = { + {"neon_fp32_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax<false>)}, + {"neon_fp16_softmax", + [](const 
SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax<false>)}, + {"neon_qu8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)}, + {"neon_qs8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)}, + {"neon_fp32_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax<true>)}, + {"neon_fp16_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax<true>)}, + {"neon_qu8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)}, + {"neon_qs8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)}, }; -Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::F16, DataType::F32); - - // Validate in case of configured output - if (output.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), - TensorShape(input.tensor_shape()).set(0, 1)); - } - - return Status{}; -} -} //namespace -const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels() -{ - return available_kernels_max_logits; -} - -void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); - - // Softmax across the x dimension - const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - - const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - _run_method = uk->ukernel; - _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); - - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); - - return Status{}; -} - -void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window); -} - -const char *CpuLogits1DMaxKernel::name() const -{ - return _name.c_str(); -} - -/* Softmax Logits 1D - computation for QASYMM8 with 
pre-computed max. */ -template <bool IS_LOG> -static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = { - {"sve2_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)}, - {"sve2_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)}, - {"sve_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_softmax)}, - {"sve_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_softmax)}, - - {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_softmax)}, - {"neon_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_softmax)}, - {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, - {"neon_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, -}; -namespace -{ -Status validate_arguments_logits_softmax(const ITensorInfo &src, - const ITensorInfo &max, - const ITensorInfo &dst, - const float beta, - const ITensorInfo &tmp, - bool is_log) +Status validate_arguments_softmax( + const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log) { ARM_COMPUTE_UNUSED(beta); // 
Check input @@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - // Check max - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); - // Check output if configured if (dst.total_size() != 0) { @@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, // Check tmp if configured if (tmp.total_size() != 0) { - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type(); - ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); + // We have temporary storage only if src data type is quantized. + // Therefore, tmp data type must be F32 + ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric); + // We could potentially reduce tmp memory if we could predict or make an assumption // on the maximum number of threads that will run in parallel. 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); @@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, } } // namespace -template <bool IS_LOG> -const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> & -CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels() +const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels() { - return available_kernels_logits<IS_LOG>; + return available_kernels; } -template <bool IS_LOG> -void CpuLogits1DSoftmaxKernel<IS_LOG>::configure( - const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); // Configure kernel window const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); // Output auto initialization if not yet initialized const QuantizationInfo output_quantization = - is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log) : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); - // Tmp auto initialization if not yet initialized - const DataType tmp_data_type = is_quantized_asymmetric ? 
DataType::F32 : src->data_type(); - auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + // Tmp auto initialization if not yet initialized and src is quantized + if (is_quantized_asymmetric) + { + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); + auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + } - const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation( - DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + const auto *uk = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - std::string kernel_name = - IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel"); _beta = beta; _run_method = uk->ukernel; _name = kernel_name.append("/").append(uk->name); - // Configure kernel window - Window win = calculate_max_window(*max, Steps()); + Window win = calculate_max_window(*dst, Steps()); + + /// TODO: Check dimensions > 0 for holes only. For this, we need + /// a utility function checking if there are holes after some dimension. 
+ if (!has_holes(*dst, dst->num_dimensions() - 1)) + { + win = win.collapse(win, Window::DimY); + } - ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::configure(win); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis + + ICpuKernel<CpuSoftmaxKernel>::configure(win); } -template <bool IS_LOG> -Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate( - const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +Status CpuSoftmaxKernel::validate( + const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); return Status{}; } -template <bool IS_LOG> -void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto max = tensors.get_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); - const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + if 
(is_data_type_quantized_asymmetric(src->info()->data_type())) + { + auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; - ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); + ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); + void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); + _run_method(src, tmp_for_thread, dst, _beta, window); + } + else + { + _run_method(src, nullptr, dst, _beta, window); + } } -template <bool IS_LOG> -const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const +const char *CpuSoftmaxKernel::name() const { return _name.c_str(); } -template class CpuLogits1DSoftmaxKernel<true>; -template class CpuLogits1DSoftmaxKernel<false>; - } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 5d288179fd..3db1f3d0ef 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -33,102 +33,55 @@ namespace cpu { namespace kernels { -/** Interface for the identifying the max value of 1D Logits */ -class CpuLogits1DMaxKernel : public ICpuKernel<CpuLogits1DMaxKernel> +/** Interface for softmax computation */ +class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel> { private: - using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type; + using SoftmaxKernelPtr = + std::add_pointer<void(const ITensor *, void *const, ITensor *, float, const Window &)>::type; public: - CpuLogits1DMaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DMaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - struct SoftmaxLogits1DMaxKernel - { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DMaxKernelPtr ukernel; - }; - - static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels(); - -private: - SoftmaxLogits1DMaxKernelPtr _run_method{nullptr}; - std::string _name{}; -}; - -/** Interface for softmax computation for QASYMM8 with pre-computed max. 
*/ -template <bool IS_LOG = false> -class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>> -{ -private: - using SoftmaxLogits1DKernelPtr = std::add_pointer<void( - const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type; - -public: - CpuLogits1DSoftmaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); + CpuSoftmaxKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel); /** Set the input and output tensors. * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. - * Data types supported: same as @p input. - * @param[out] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta A scaling factor for the exponent. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p input. + * @param[in] beta A scaling factor for the exponent. + * @param[in] is_log True if the operation is log-softmax * * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
*/ - void - configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp); /** Static function to check if given info will lead to a valid configuration * - * Similar to CpuLogits1DSoftmaxKernel::configure() + * Similar to CpuSoftmaxKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, - const ITensorInfo *max, - const ITensorInfo *dst, - const float beta, - const ITensorInfo *tmp); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; - struct SoftmaxLogits1DKernel + struct SoftmaxKernel { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DKernelPtr ukernel; + const char *name; + const SoftmaxKernelDataTypeISASelectorDataPtr is_selected; + SoftmaxKernelPtr ukernel; }; - static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels(); + static const std::vector<SoftmaxKernel> &get_available_kernels(); private: - float _beta{1.0f}; - SoftmaxLogits1DKernelPtr _run_method{nullptr}; - std::string _name{}; + float _beta{1.0f}; + SoftmaxKernelPtr _run_method{nullptr}; + std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index 2e2adf33e0..db8f881712 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -31,21 +31,18 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, - const ITensor 
*max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); -} -void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) +template <bool IS_LOG> +void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max<float16_t>(in, out, window); + return neon_softmax_float<float16_t, IS_LOG>(in, tmp, out, beta, window); } + +template void +neon_fp16_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp16_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index 61df40c1b5..c281d1bf31 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); -} -void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) +template <bool IS_LOG> +void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max<float>(in, out, window); + return neon_softmax_float<float, IS_LOG>(in, tmp, out, beta, window); } + +template void +neon_fp32_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp32_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 5d6e6a4f80..487f6ae051 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -29,43 +29,76 @@ namespace arm_compute { namespace cpu { -template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); -template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); - -template <typename T> -void neon_softmax_logits_1d_quantized( - const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) +template <typename T, bool IS_LOG> +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, "quantized type should be either qasymm8_t or 
qasymm8_signed_t."); - const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator out_it(out, window); - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); constexpr int vec_size = 16; +#ifndef __aarch64__ + const int sum_stages = log2(vec_size >> 1); +#endif // __aarch64__ + + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + float *tmp_ptr = reinterpret_cast<float *>(tmp); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; - float sum{}; - float sum_inversed{}; + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ + + // Compute left-over elements + 
for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // Compute Max + + float sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); /* Init sum to zero */ @@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized( int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); - if (is_log) + if (IS_LOG) { vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); @@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized( } /* Reduce sum */ - const auto sum_16_byte = + const float32x4_t sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + + float sum; + +#ifdef __aarch64__ + sum = wrapper::vaddv(sum_16_byte); +#else // __aarch64__ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); sum_res = vpadd_f32(sum_res, sum_res); sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { float element{}; - if (is_log) + if (IS_LOG) { element = (max_val - in_ptr[x]) * scale_beta; sum += std::exp(element); @@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized( tmp_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = 256.f / sum; + sum_transformed = 256.f / sum; } else { - sum = std::log(sum); + sum_transformed = std::log(sum); } - } + } 
// Compute exponentials and sum /* Normalize exponentials */ { constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + + const float32x4_t sum_vec = vdupq_n_f32(sum_transformed); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) @@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized( using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); int_vec_type normalized_value{}; - if (is_log) + if (IS_LOG) { const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[0], sum_vec), + vsubq_f32(vec_in.val[1], sum_vec), + vsubq_f32(vec_in.val[2], sum_vec), + vsubq_f32(vec_in.val[3], sum_vec), }; normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); } else { float32x4x4_t mul = { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[0], sum_vec), + vmulq_f32(vec_in.val[1], sum_vec), + vmulq_f32(vec_in.val[2], sum_vec), + vmulq_f32(vec_in.val[3], sum_vec), }; if (is_qasymm8_signed) @@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized( /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed); } else { - out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) - (is_qasymm8_signed ? 
128.f : 0)); } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } -template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); +template void neon_softmax_quantized<qasymm8_signed_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_signed_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 4d9b789297..60380cd233 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" @@ -33,105 +33,100 @@ namespace arm_compute { namespace cpu { -template <typename T> -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - Window win{window}; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop( - win, - [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); - int x = window_start_x; - - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for (int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for (; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } +#ifdef __aarch64__ +namespace +{ +// These helper functions are added because vaddv does not exist for fp16, +// and, therefore, is not part of the wrapper::vaddv interface. 
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages) +{ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + return wrapper::vgetlane(sum_res, 0); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - *out_ptr = max_val; - }, - input, output); +inline float wrapper_vaddv(const float32x4_t &a, int sum_stages) +{ + ARM_COMPUTE_UNUSED(sum_stages); + return wrapper::vaddv(a); } +} // namespace +#endif // __aarch64__ -template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); - -template <typename T> -void neon_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +// The template implementation for float data types is stored in the header file because +// we need all fp16 instantiated code to live in fp16.cpp files. +template <typename T, bool IS_LOG> +void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { - const int start_x = in->info()->valid_region().anchor.x(); + ARM_COMPUTE_UNUSED(tmp); + const int input_width = in->info()->valid_region().shape.x(); Iterator in_it(in, window); - Iterator max_it(max, window); Iterator out_it(out, window); /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); + constexpr int vec_size = 16 / sizeof(T); + + const int sum_stages = log2(vec_size >> 1); + + const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}); execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<T *>(tmp); + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; + + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ - T sum{}; - T sum_inversed{}; + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // compute max + + T sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); /* Init sum to zero */ @@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in, { auto vec_elements = wrapper::vloadq(in_ptr + x); vec_elements = wrapper::vsub(vec_elements, 
vec_max); - if (is_log) + if (IS_LOG) { - vec_elements = - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + vec_elements = wrapper::vmul(vec_elements, beta_vec); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); } else { - vec_elements = wrapper::vexpq( - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec)); + vec_sum = wrapper::vadd(vec_sum, vec_elements); } - wrapper::vstore(tmp_ptr + x, vec_elements); + wrapper::vstore(out_ptr + x, vec_elements); } /* Reduce sum */ + T sum{}; +#ifdef __aarch64__ + sum = wrapper_vaddv(vec_sum, sum_stages); +#else // __aarch64__ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); for (int i = 0; i < sum_stages; ++i) { sum_res = wrapper::vpadd(sum_res, sum_res); } sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { T element{}; - if (is_log) + if (IS_LOG) { element = (in_ptr[x] - max_val) * beta; sum += std::exp(element); @@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in, element = std::exp((in_ptr[x] - max_val) * beta); sum += element; } - tmp_ptr[x] = element; + + out_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = T(1) / sum; + sum_transformed = T(1) / sum; } else { - sum = static_cast<T>(std::log(sum)); + sum_transformed = static_cast<T>(std::log(sum)); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { + const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{}); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = 
wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if (is_log) + const auto vec_in = wrapper::vloadq(out_ptr + x); + if (IS_LOG) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec)); } else { - normalized_value = - wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec)); } - wrapper::vstore(out_ptr + x, normalized_value); } + /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = tmp_ptr[x] - sum; + out_ptr[x] = out_ptr[x] - sum_transformed; } else { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + out_ptr[x] = out_ptr[x] * sum_transformed; } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } + +template <typename T, bool IS_LOG> +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index 40713dc496..9589ebcd7c 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template <bool IS_LOG> +void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, window); } -void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max<qasymm8_t>(in, out, window); -} +template void +neon_qasymm8_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_qasymm8_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 2c5e284f54..0bf6b2859a 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template <bool IS_LOG> +void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, window); } -void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max<qasymm8_signed_t>(in, out, window); -} +template void neon_qasymm8_signed_softmax<true>( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void neon_qasymm8_signed_softmax<false>( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp deleted file mode 100644 index 5e94f72faf..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2021-2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/CpuTypes.h" -#include "src/cpu/kernels/softmax/generic/sve/impl.h" -namespace arm_compute -{ -namespace cpu -{ -void sve_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); -} - -void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<float16_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp deleted file mode 100644 index d692cc2477..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); -} - -void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<float>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 24f1bb8143..0d4b7f4509 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to +/// a single kernel that performs softmax operation. Leaving the SVE code here for +/// future reference.
Implementation for Neon(TM) is introduced in COMPMID-6500 template <typename ScalarType> void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) { @@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in, }, in_it, max_it, out_it); } - -template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); - -template void sve_softmax_logits_1d_float<float>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); -template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp deleted file mode 100644 index 85e5ccfea1..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<qasymm8_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp deleted file mode 100644 index 4be2e2eed6..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<qasymm8_signed_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 98b2f5117f..a8fb1d4adf 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,6 +32,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to +/// a single kernel that performs softmax operation. Leaving the SVE2 code here for +/// future reference. Implementation for Neon(TM) is introduced in COMPMID-6500 template <typename ScalarType> void sve2_softmax_logits_1d_quantized( const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) @@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized( }, in_it, max_it, out_it); } - -template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp deleted file mode 100644 index 95623786b3..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp deleted file mode 100644 index c20462fcef..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index 627ce0c264..c143f6659d 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,41 +21,24 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H namespace arm_compute { namespace cpu { -#define DECLARE_SOFTMAX_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \ - bool is_log, const Window &window) +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + template <bool IS_LOG> \ + void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax); #undef DECLARE_SOFTMAX_KERNEL - -#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window) - -DECLARE_LOGITS_KERNEL(neon_fp32_logits); -DECLARE_LOGITS_KERNEL(neon_fp16_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_singed_logits); -DECLARE_LOGITS_KERNEL(sve_fp32_logits); -DECLARE_LOGITS_KERNEL(sve_fp16_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits); - -#undef DECLARE_LOGITS_KERNEL } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index e55d7f903e..ae14381ad9 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,13 +41,10 @@ namespace arm_compute { namespace cpu { -template <bool IS_LOG> -CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric() +CpuSoftmaxGeneric::CpuSoftmaxGeneric() : _permute_input(), _permute_output(), - _max_kernel(), _softmax_kernel(), - _max(), _tmp(), _input_permuted(), _output_permuted(), @@ -56,8 +53,7 @@ CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric() { } -template <bool IS_LOG> -void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis) +void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -79,29 +75,23 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d // or it is the original input case (2D case) const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src); - // Create intermediate tensors shapes - TensorShape max_sum_shape = tmp_input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = - is_data_type_quantized_asymmetric(tmp_input->data_type()) ? 
DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + TensorInfo tensor_info_tmp; + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Create intermediate tensors shapes + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32); + } // Init intermediate tensors - _max = TensorInfo(max_info); _tmp = TensorInfo(tensor_info_tmp); // Configure kernels - auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>(); - mk->configure(tmp_input, &_max); - _max_kernel = std::move(mk); - - auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>(); + auto sm = std::make_unique<kernels::CpuSoftmaxKernel>(); if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor - sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); + sm->configure(tmp_input, &_output_permuted, beta, is_log, &_tmp); // Re-permute the permuted output into the requested (4D) output _permute_output.configure(&_output_permuted, dst, @@ -110,14 +100,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d else { // Softmax 2D case - sm->configure(tmp_input, &_max, dst, beta, &_tmp); + sm->configure(tmp_input, dst, beta, is_log, &_tmp); } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = - MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = - MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + if (_tmp.total_size() > 0) + { + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + } _aux_mem[InternalTensorIdx::PERMUTED_SRC] = 
MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); @@ -125,8 +116,8 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d MemoryLifetime::Temporary, _output_permuted.total_size()); } -template <bool IS_LOG> -Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis) +Status +CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -136,17 +127,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor static_cast<int32_t>(src->num_dimensions()) <= axis); // Create intermediate tensor info - DataType tmp_data_type = src->data_type(); - const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone() - ->set_tensor_shape(max_sum_shape) - .set_data_type(tmp_data_type) - .set_quantization_info(src->quantization_info()) - .set_is_resizable(true)); - const TensorInfo dont_care; + TensorInfo tensor_info_tmp; + + if (is_data_type_quantized_asymmetric(src->data_type())) + { + tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true); + } const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); @@ -165,15 +151,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate( - &tensor_info_tmp, 
&tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuSoftmaxKernel::validate(src, dst, beta, is_log, &tensor_info_tmp)); return Status{}; } -template <bool IS_LOG> -void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) +void CpuSoftmaxGeneric::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); @@ -181,13 +164,11 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) auto dst = tensors.get_tensor(TensorType::ACL_DST); CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true); - CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); - ITensorPack max_pack; ITensorPack softmax_pack; if (_needs_permute) @@ -195,24 +176,15 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, - {TensorType::ACL_SRC_1, max.get()}, {TensorType::ACL_DST_0, output_permuted.get()}, {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; - - softmax_pack = {{TensorType::ACL_SRC_0, src}, - {TensorType::ACL_SRC_1, max.get()}, - {TensorType::ACL_DST_0, dst}, - {TensorType::ACL_DST_1, tmp.get()}}; + softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}}; } - NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); 
NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); if (_needs_permute) @@ -224,13 +196,10 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) } } -template <bool IS_LOG> -experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const +experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const { return _aux_mem; } -template class CpuSoftmaxGeneric<false>; -template class CpuSoftmaxGeneric<true>; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 8cab70e14f..47020e9b7c 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_H -#define ARM_COMPUTE_CPU_SOFTMAX_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H +#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/TensorInfo.h" @@ -37,9 +37,7 @@ namespace arm_compute { namespace cpu { -class CpuLogits1DMaxKernel; -template <bool IS_LOG> -class CpuLogits1DSoftmaxKernel; +class CpuSoftmaxKernel; /** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. * @@ -52,31 +50,31 @@ class CpuLogits1DSoftmaxKernel; * This function runs the following function/kernels: * -# If axis is not 0: * -# @ref CpuPermute - * -# @ref kernels::CpuLogits1DMaxKernel - * -# @ref kernels::CpuLogits1DSoftmaxKernel + * -# @ref kernels::CpuSoftmaxKernel */ -template <bool IS_LOG = false> class CpuSoftmaxGeneric : public ICpuOperator { public: CpuSoftmaxGeneric(); /** Set the input and output tensors. * - * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
- * last value of each row to the nearest multiple. - * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. - * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * last value of each row to the nearest multiple. + * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 + * @param[in] is_log True if the operation is log-softmax */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSoftmaxGeneric::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -85,8 +83,7 @@ public: private: enum InternalTensorIdx { - MAX = 0, - TMP, + TMP = 0, PERMUTED_SRC, PERMUTED_DST, COUNT @@ -94,10 +91,8 @@ private: CpuPermute _permute_input; CpuPermute _permute_output; - std::unique_ptr<ICPPKernel> _max_kernel; std::unique_ptr<ICPPKernel> _softmax_kernel; - TensorInfo _max; TensorInfo _tmp; TensorInfo _input_permuted; TensorInfo _output_permuted; @@ -105,9 +100,7 @@ private: bool _needs_permute; 
experimental::MemoryRequirements _aux_mem{}; }; -using CpuSoftmax = CpuSoftmaxGeneric<false>; -using CpuLogSoftmax = CpuSoftmaxGeneric<true>; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index e3c2012d05..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,7 +29,6 @@ #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" -#include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute @@ -37,13 +36,12 @@ namespace arm_compute template <bool IS_LOG> struct NESoftmaxLayerGeneric<IS_LOG>::Impl { - const ITensor *src{nullptr}; - ITensor *dst{nullptr}; - Tensor max{nullptr}; - std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{nullptr}; - MemoryGroup memory_group{}; - ITensorPack run_pack{}; - WorkspaceData<Tensor> workspace_tensors{}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; }; template <bool IS_LOG> @@ -67,8 +65,8 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f _impl->src = input; _impl->dst = output; - _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>(); - _impl->op->configure(input->info(), output->info(), beta, axis); + _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = 
manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); @@ -79,7 +77,7 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp index b372bdf3fa..2397d81547 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -22,14 +22,12 @@ * SOFTWARE. */ #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "src/common/cpuinfo/CpuIsaInfo.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -42,6 +40,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { /** Tolerance for float operations */ @@ -53,7 +52,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_signed(1); /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -66,53 +65,53 @@ TEST_SUITE(NEON) TEST_SUITE(SoftmaxLayer) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, 
DataType::F32), // Mismatching data types - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low - QuantizationInfo(1.f/256, 12)), - }), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), - TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - })), - framework::dataset::make("beta", { 1.0, - 2.0, - 1.0, - 2.0, - 1.0, - 1.0, - 2.0, - 1.0, - })), - framework::dataset::make("axis", { 0, - 0, - 0, - 1, - 0, - -1, - 2, - -3, - })), - framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })), - input_info, output_info, beta, axis, expected) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types + TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info + QuantizationInfo(1.f/256, 
12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low + QuantizationInfo(1.f/256, 12)), + }), + make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), + TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + }), + make("beta", { 1.0, + 2.0, + 1.0, + 2.0, + 1.0, + 1.0, + 2.0, + 1.0, + }), + make("axis", { 0, + 0, + 0, + 1, + 0, + -1, + 2, + -3, + }), + make("Expected", { false, false, false, true, true, true, false, false })), + input_info, output_info, beta, axis, expected) { ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); } @@ -122,54 +121,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( template <typename T> using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>; -DATA_TEST_CASE(KernelSelection_max_logits, framework::DatasetMode::ALL, concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - 
})), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) -{ - using namespace cpu::kernels; - - cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.fp16 = (data_type == DataType::F16); - - const auto *selected_impl = CpuLogits1DMaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); - - ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_logits_1d_max"; - std::string actual = selected_impl->name; - - ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); -} - -DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16 - }))), - combine(framework::dataset::make("CpuExt", std::string("SVE2")), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) +DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, + concat(concat( + combine( + make("CpuExt", std::string("NEON")), + make("DataType", { DataType::F32, + DataType::F16, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED}) + ), + combine( + make("CpuExt", std::string("SVE")), + make("DataType", { DataType::F32, + DataType::F16})) + ), + combine( + make("CpuExt", std::string("SVE2")), + make("DataType", { DataType::QASYMM8, + DataType::QASYMM8_SIGNED})) + ), + cpu_ext, data_type) { using namespace cpu::kernels; @@ -179,11 +150,12 
@@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca cpu_isa.sve2 = (cpu_ext == "SVE2"); cpu_isa.fp16 = (data_type == DataType::F16); - const auto *selected_impl = CpuLogits1DSoftmaxKernel<false>::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); + const auto *selected_impl = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_softmax_logits_1d"; + std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax"; std::string actual = selected_impl->name; ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); @@ -192,26 +164,32 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 2, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, 
NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 2, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -220,26 +198,30 @@ TEST_SUITE_END() //FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -2, 3 }))) 
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::Small4DShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -2, 3 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -252,29 +234,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor, TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, 
framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -2 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, 1, -2 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.0f }))), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }) + ), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -282,20 +275,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framew TEST_SUITE_END() //QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - 
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, 1, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); |