author    Gunes Bayir <gunes.bayir@arm.com>  2023-11-07 05:43:07 +0000
committer Gunes Bayir <gunes.bayir@arm.com>  2023-12-05 13:52:17 +0000
commit    fadc9b1e0bba90d6a91beb65466b2a0895b3a5e4 (patch)
tree      7d095fefe3634b4ca86dc9088bb2990d64d3a7c8
parent    23158b0a69b85c9c6e5a7f2457bfe10be04d6132 (diff)
download  ComputeLibrary-fadc9b1e0bba90d6a91beb65466b2a0895b3a5e4.tar.gz
Optimize CpuSoftmaxKernel for axis=0
Implement a single kernel instead of having two consecutive ones. In the
previous setup, one kernel was calculating the maximum value in the axis, and
this maximum was being subtracted from each data element while calculating the
softmax, i.e.

    softmax(x_i) = exp(x_i - max) / sum_i( exp(x_i - max) )

This patch integrates these two stages into a single kernel for Neon™ for all
data types. This saves some memory because we no longer need to hold the max
values in a separate auxiliary tensor. It also introduces other optimizations
that ease memory pressure when the data type is float/half, by using the dst
tensor as temporary storage for the already-exponentiated inputs. It removes
the references to the SVE and SVE2 implementations and most of the associated
files, but leaves the implementations in place as they may be used in the
future.

Resolves: COMPMID-6500
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: Icff9976d1214c4c6cbe15a62ca60b8a77d3784cc
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10688
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
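For reference, a minimal scalar sketch of the fused computation described
above (illustrative only, not the vectorized Neon™ kernels from this patch;
the helper name is hypothetical):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Fused softmax over one row (axis 0): the max pass, the exp/sum pass and
    // the normalization all happen in one call, and dst doubles as temporary
    // storage for the exponentiated values, so no auxiliary max tensor is needed.
    void fused_softmax_row(const float *src, float *dst, std::size_t len, float beta)
    {
        // Stage 1: row maximum (previously a separate kernel).
        float max_val = src[0];
        for (std::size_t i = 1; i < len; ++i)
            max_val = std::max(max_val, src[i]);

        // Stage 2: exponentiate into dst and accumulate the sum.
        float sum = 0.f;
        for (std::size_t i = 0; i < len; ++i)
        {
            dst[i] = std::exp((src[i] - max_val) * beta);
            sum += dst[i];
        }

        // Stage 3: normalize in place.
        const float inv_sum = 1.f / sum;
        for (std::size_t i = 0; i < len; ++i)
            dst[i] *= inv_sum;
    }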
-rw-r--r--  docs/user_guide/release_version_and_change_log.dox |   6
-rw-r--r--  filelist.json |  10
-rw-r--r--  src/BUILD.bazel |  10
-rw-r--r--  src/CMakeLists.txt |   6
-rw-r--r--  src/core/NEON/wrapper/intrinsics/max.h |  41
-rw-r--r--  src/cpu/kernels/CpuKernelSelectionTypes.h |  16
-rw-r--r--  src/cpu/kernels/CpuSoftmaxKernel.cpp | 263
-rw-r--r--  src/cpu/kernels/CpuSoftmaxKernel.h |  99
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/fp16.cpp |  21
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/fp32.cpp |  23
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/impl.cpp | 152
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/impl.h | 210
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/qasymm8.cpp |  22
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp |  23
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/fp16.cpp |  50
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/fp32.cpp |  49
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/impl.cpp |  25
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/qasymm8.cpp |  38
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp |  38
-rw-r--r--  src/cpu/kernels/softmax/generic/sve2/impl.cpp |  20
-rw-r--r--  src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp |  44
-rw-r--r--  src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp |  44
-rw-r--r--  src/cpu/kernels/softmax/list.h |  31
-rw-r--r--  src/cpu/operators/CpuSoftmax.cpp |  91
-rw-r--r--  src/cpu/operators/CpuSoftmax.h |  39
-rw-r--r--  src/runtime/NEON/functions/NESoftmaxLayer.cpp |  22
-rw-r--r--  tests/validation/NEON/SoftmaxLayer.cpp | 299
27 files changed, 631 insertions, 1061 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 13f4e9ea2a..ac4f0610ea 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -46,6 +46,8 @@ v24.01 Public major release
You should link only to the main `libarm_compute` library for core functionality.
- New features
- Add support for FP16 in all multi_isa builds.
+ - Performance optimizations:
+ - Optimize @ref NESoftmaxLayer
v23.11 Public major release
- New features
@@ -438,8 +440,8 @@ v21.02 Public major release
- @ref NEActivationLayer
- @ref NEArithmeticAddition
- @ref NEBatchNormalizationLayerKernel
- - @ref cpu::kernels::CpuLogits1DSoftmaxKernel
- - @ref cpu::kernels::CpuLogits1DMaxKernel
+ - cpu::kernels::CpuLogits1DSoftmaxKernel
+ - cpu::kernels::CpuLogits1DMaxKernel
- @ref cpu::kernels::CpuElementwiseUnaryKernel
- Remove padding from OpenCL kernels:
- CLDirectConvolutionLayerKernel
diff --git a/filelist.json b/filelist.json
index c34eff2ff9..60f4285a03 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2213,16 +2213,10 @@
"qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"]
},
"sve": {
- "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ],
- "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"],
- "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"],
- "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ],
- "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"]
+ "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ]
},
"sve2":{
- "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"],
- "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"],
- "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"]
+ "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"]
}
}
},
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f281b6a4d5..c14e10c836 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -117,9 +117,7 @@ filegroup(
"cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp",
"cpu/kernels/elementwise_unary/generic/sve2/q8.cpp",
"cpu/kernels/lut/generic/sve2/u8.cpp",
- "cpu/kernels/softmax/generic/sve2/impl.cpp",
- "cpu/kernels/softmax/generic/sve2/qasymm8.cpp",
- "cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] +
+ "cpu/kernels/softmax/generic/sve2/impl.cpp"] +
glob(["**/*.h",
"**/*.hpp",
"**/*.inl"]),
@@ -342,11 +340,7 @@ filegroup(
"cpu/kernels/scale/sve/integer.cpp",
"cpu/kernels/scale/sve/qasymm8.cpp",
"cpu/kernels/scale/sve/qasymm8_signed.cpp",
- "cpu/kernels/softmax/generic/sve/fp16.cpp",
- "cpu/kernels/softmax/generic/sve/fp32.cpp",
- "cpu/kernels/softmax/generic/sve/impl.cpp",
- "cpu/kernels/softmax/generic/sve/qasymm8.cpp",
- "cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] +
+ "cpu/kernels/softmax/generic/sve/impl.cpp"] +
glob(["**/*.h",
"**/*.hpp",
"**/*.inl"]),
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3229ffa8c2..e6c6782da1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -317,11 +317,7 @@ target_sources(
cpu/kernels/scale/sve/integer.cpp
cpu/kernels/scale/sve/qasymm8.cpp
cpu/kernels/scale/sve/qasymm8_signed.cpp
- cpu/kernels/softmax/generic/sve/fp16.cpp
- cpu/kernels/softmax/generic/sve/fp32.cpp
cpu/kernels/softmax/generic/sve/impl.cpp
- cpu/kernels/softmax/generic/sve/qasymm8.cpp
- cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
)
target_sources(
@@ -339,8 +335,6 @@ target_sources(
cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
cpu/kernels/lut/generic/sve2/u8.cpp
cpu/kernels/softmax/generic/sve2/impl.cpp
- cpu/kernels/softmax/generic/sve2/qasymm8.cpp
- cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
)
target_sources(
diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h
index cec437d171..32d38a856c 100644
--- a/src/core/NEON/wrapper/intrinsics/max.h
+++ b/src/core/NEON/wrapper/intrinsics/max.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WRAPPER_MAX_H
-#define ARM_COMPUTE_WRAPPER_MAX_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
+#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
#include <arm_neon.h>
@@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VMAX_IMPL
+
+#if defined(__aarch64__)
+// VMAXV: Across vector max
+#define VMAXV_IMPL(stype, vtype, prefix, postfix) \
+ inline stype vmaxv(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8)
+VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8)
+VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16)
+VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16)
+VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32)
+VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32)
+VMAXV_IMPL(float, float32x2_t, vmaxv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8)
+VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8)
+VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16)
+VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16)
+VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32)
+VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32)
+VMAXV_IMPL(float, float32x4_t, vmaxvq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMAXV_IMPL
+#endif // defined(__aarch64__)
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
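The vmaxv overloads added above let the softmax kernels reduce a vector to its
maximum in a single AArch64 instruction. A hedged sketch using the raw
intrinsics (the wrapper simply forwards to these; the function name is
hypothetical):

    #include <arm_neon.h>

    // Horizontal max of a float32x4_t. On AArch64, vmaxvq_f32 lowers to the
    // FMAXV across-vector instruction; wrapper::vmaxv(v) wraps exactly this.
    float horizontal_max(float32x4_t v)
    {
    #ifdef __aarch64__
        return vmaxvq_f32(v); // single across-vector reduction
    #else
        // AArch32 fallback: pairwise max until one lane remains.
        float32x2_t m = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
        m = vpmax_f32(m, m);
        return vget_lane_f32(m, 0);
    #endif
    }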
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index b7daa4d583..45ebeec394 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
-#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
+#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
#include "arm_compute/core/Types.h"
@@ -99,6 +99,13 @@ struct ScaleKernelDataTypeISASelectorData
InterpolationPolicy interpolation_policy;
};
+struct SoftmaxKernelDataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+ bool is_log;
+};
+
// Selector pointer types
using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
@@ -113,9 +120,10 @@ using CpuAddKernelDataTypeISASelectorDataPtr =
std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
using ScaleKernelDataTypeISASelectorDataPtr =
std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
-
+using SoftmaxKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type;
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
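The new selector type above is queried with a first-match scan over the kernel
table; a simplified standalone sketch of the idea (ACL's real lookup is
provided by the ICpuKernel base, so this helper is hypothetical):

    #include <vector>

    // First entry whose predicate accepts (dt, isa, is_log) wins, so more
    // specialized kernels must precede generic fallbacks in the table.
    template <typename Kernel, typename SelectorData>
    const Kernel *select_kernel(const std::vector<Kernel> &table, const SelectorData &data)
    {
        for (const auto &uk : table)
        {
            if (uk.is_selected(data))
            {
                return &uk;
            }
        }
        return nullptr; // callers assert on nullptr, as configure() does
    }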
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index ce144351f8..486f55e2c1 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -34,9 +34,12 @@
#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/softmax/list.h"
+#include <vector>
+
namespace arm_compute
{
namespace cpu
@@ -45,136 +48,40 @@ namespace kernels
{
namespace
{
-/* Softmax Logits 1D Max - identifying the max value of 1D Logits */
-static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = {
- {"sve_fp32_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_logits)},
- {"sve_fp16_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_logits)},
- {"sve_qu8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
- REGISTER_QASYMM8_SVE(sve_qasymm8_logits)},
- {"sve_qs8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
- REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)},
- {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_logits)},
- {"neon_fp16_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_logits)},
- {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(neon_qasymm8_logits)},
- {"neon_qs8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)},
+/* Softmax */
+static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = {
+ {"neon_fp32_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<false>)},
+ {"neon_fp16_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<false>)},
+ {"neon_qu8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)},
+ {"neon_qs8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)},
+ {"neon_fp32_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<true>)},
+ {"neon_fp16_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<true>)},
+ {"neon_qu8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)},
+ {"neon_qs8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)},
};
-Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if (output.total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(),
- TensorShape(input.tensor_shape()).set(0, 1));
- }
-
- return Status{};
-}
-} //namespace
-const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels()
-{
- return available_kernels_max_logits;
-}
-
-void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
- // Softmax across the x dimension
- const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
- const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- _run_method = uk->ukernel;
- _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
-
- Window win = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
- return Status{};
-}
-
-void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- _run_method(src, dst, window);
-}
-
-const char *CpuLogits1DMaxKernel::name() const
-{
- return _name.c_str();
-}
-
-/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG>
-static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = {
- {"sve2_qu8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)},
- {"sve2_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)},
- {"sve_fp32_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_softmax)},
- {"sve_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_softmax)},
-
- {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_softmax)},
- {"neon_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_softmax)},
- {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)},
- {"neon_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)},
-};
-namespace
-{
-Status validate_arguments_logits_softmax(const ITensorInfo &src,
- const ITensorInfo &max,
- const ITensorInfo &dst,
- const float beta,
- const ITensorInfo &tmp,
- bool is_log)
+Status validate_arguments_softmax(
+ const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log)
{
ARM_COMPUTE_UNUSED(beta);
// Check input
@@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
- // Check max
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
// Check output if configured
if (dst.total_size() != 0)
{
@@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
// Check tmp if configured
if (tmp.total_size() != 0)
{
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
- ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
+ // We have temporary storage only if src data type is quantized.
+ // Therefore, tmp data type must be F32
+ ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric);
+
// We could potentially reduce tmp memory if we could predict or make an assumption
// on the maximum number of threads that will run in parallel.
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
@@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
}
} // namespace
-template <bool IS_LOG>
-const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &
-CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels()
{
- return available_kernels_logits<IS_LOG>;
+ return available_kernels;
}
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(
- const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
+void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
// Configure kernel window
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
// Output auto initialization if not yet initialized
const QuantizationInfo output_quantization =
- is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG)
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log)
: dst->quantization_info();
auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
- // Tmp auto initialization if not yet initialized
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
- auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+ // Tmp auto initialization if not yet initialized and src is quantized
+ if (is_quantized_asymmetric)
+ {
+ const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
+ auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+ }
- const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(
- DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ const auto *uk = CpuSoftmaxKernel::get_implementation(
+ SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- std::string kernel_name =
- IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+ std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel");
_beta = beta;
_run_method = uk->ukernel;
_name = kernel_name.append("/").append(uk->name);
- // Configure kernel window
- Window win = calculate_max_window(*max, Steps());
+ Window win = calculate_max_window(*dst, Steps());
+
+ /// TODO: Check dimensions > 0 for holes only. For this, we need
+ /// a utility function checking if there are holes after some dimension.
+ if (!has_holes(*dst, dst->num_dimensions() - 1))
+ {
+ win = win.collapse(win, Window::DimY);
+ }
- ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::configure(win);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis
+
+ ICpuKernel<CpuSoftmaxKernel>::configure(win);
}
-template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
- const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
+Status CpuSoftmaxKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
return Status{};
}
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window);
ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
- const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
- const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+ {
+ auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
+
+ const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
+ const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
- ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
+ ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
- void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
- _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
+ void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+ _run_method(src, tmp_for_thread, dst, _beta, window);
+ }
+ else
+ {
+ _run_method(src, nullptr, dst, _beta, window);
+ }
}
-template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+const char *CpuSoftmaxKernel::name() const
{
return _name.c_str();
}
-template class CpuLogits1DSoftmaxKernel<true>;
-template class CpuLogits1DSoftmaxKernel<false>;
-
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
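Note how run_op above only touches tmp for quantized inputs: the tensor holds
one F32 row per thread, and each thread offsets into its own slice. A hedged
standalone sketch of that arithmetic (hypothetical helper):

    #include <cstddef>
    #include <cstdint>

    // Per-thread scratch layout used for quantized softmax: tmp must hold
    // num_threads rows of row_elems floats; thread `tid` gets row `tid`.
    std::uint8_t *tmp_for_thread(std::uint8_t *tmp_base, std::size_t row_elems, int tid)
    {
        const std::size_t bytes_per_thread = row_elems * sizeof(float); // tmp is always F32
        return tmp_base + static_cast<std::size_t>(tid) * bytes_per_thread;
    }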
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index 5d288179fd..3db1f3d0ef 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
-#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -33,102 +33,55 @@ namespace cpu
{
namespace kernels
{
-/** Interface for the identifying the max value of 1D Logits */
-class CpuLogits1DMaxKernel : public ICpuKernel<CpuLogits1DMaxKernel>
+/** Interface for softmax computation */
+class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel>
{
private:
- using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+ using SoftmaxKernelPtr =
+ std::add_pointer<void(const ITensor *, void *const, ITensor *, float, const Window &)>::type;
public:
- CpuLogits1DMaxKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p input
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuLogits1DMaxKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
- struct SoftmaxLogits1DMaxKernel
- {
- const char *name;
- const DataTypeISASelectorPtr is_selected;
- SoftmaxLogits1DMaxKernelPtr ukernel;
- };
-
- static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
-
-private:
- SoftmaxLogits1DMaxKernelPtr _run_method{nullptr};
- std::string _name{};
-};
-
-/** Interface for softmax computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG = false>
-class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
-{
-private:
- using SoftmaxLogits1DKernelPtr = std::add_pointer<void(
- const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-
-public:
- CpuLogits1DSoftmaxKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
+ CpuSoftmaxKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel);
/** Set the input and output tensors.
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
- * Data types supported: same as @p input.
- * @param[out] dst Destination tensor info. Data types supported: same as @p input.
- * @param[in] beta A scaling factor for the exponent.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p input.
+ * @param[in] beta A scaling factor for the exponent.
+ * @param[in] is_log True if the operation is log-softmax
*
* @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
*/
- void
- configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp);
/** Static function to check if given info will lead to a valid configuration
*
- * Similar to CpuLogits1DSoftmaxKernel::configure()
+ * Similar to CpuSoftmaxKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src,
- const ITensorInfo *max,
- const ITensorInfo *dst,
- const float beta,
- const ITensorInfo *tmp);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
- struct SoftmaxLogits1DKernel
+ struct SoftmaxKernel
{
- const char *name;
- const DataTypeISASelectorPtr is_selected;
- SoftmaxLogits1DKernelPtr ukernel;
+ const char *name;
+ const SoftmaxKernelDataTypeISASelectorDataPtr is_selected;
+ SoftmaxKernelPtr ukernel;
};
- static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
+ static const std::vector<SoftmaxKernel> &get_available_kernels();
private:
- float _beta{1.0f};
- SoftmaxLogits1DKernelPtr _run_method{nullptr};
- std::string _name{};
+ float _beta{1.0f};
+ SoftmaxKernelPtr _run_method{nullptr};
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */
+#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
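A hedged usage sketch of the reworked interface (shapes and include paths are
assumptions for illustration; in practice the CpuSoftmax operator drives this
kernel and supplies the tensor infos):

    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/kernels/CpuSoftmaxKernel.h"

    using namespace arm_compute;

    void example_configure()
    {
        // Hypothetical setup: 128-wide rows, 32 rows, float data.
        TensorInfo src(TensorShape(128U, 32U), 1, DataType::F32);
        TensorInfo dst; // auto-initialized by configure()
        TensorInfo tmp; // only initialized/used for quantized inputs

        cpu::kernels::CpuSoftmaxKernel kernel;
        const Status st = cpu::kernels::CpuSoftmaxKernel::validate(
            &src, &dst, /*beta=*/1.f, /*is_log=*/false, &tmp);
        // Check st before configuring; then:
        kernel.configure(&src, &dst, /*beta=*/1.f, /*is_log=*/false, &tmp);
    }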
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index 2e2adf33e0..db8f881712 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -31,21 +31,18 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
-void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_logits_1d_max<float16_t>(in, out, window);
+ return neon_softmax_float<float16_t, IS_LOG>(in, tmp, out, beta, window);
}
+
+template void
+neon_fp16_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp16_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index 61df40c1b5..c281d1bf31 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,17 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
-void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_logits_1d_max<float>(in, out, window);
+ return neon_softmax_float<float, IS_LOG>(in, tmp, out, beta, window);
}
+
+template void
+neon_fp32_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp32_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index 5d6e6a4f80..487f6ae051 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -29,43 +29,76 @@ namespace arm_compute
{
namespace cpu
{
-template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template <typename T>
-void neon_softmax_logits_1d_quantized(
- const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
{
static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
"quantized type should be either qasymm8_t or qasymm8_signed_t.");
- const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
- const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
- const auto scale_beta_vec = vdupq_n_f32(scale_beta);
+ const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
+ const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta);
+
+ Iterator in_it(in, window);
+ Iterator out_it(out, window);
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
constexpr int vec_size = 16;
+#ifndef __aarch64__
+ const int sum_stages = log2(vec_size >> 1);
+#endif // __aarch64__
+
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
execute_window_loop(
window,
[&](const Coordinates &)
{
/* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+ const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr());
+ T *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+ float *tmp_ptr = reinterpret_cast<float *>(tmp);
+
+ T max_val;
+
+ /* Compute Max */
+ {
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = 0;
- float sum{};
- float sum_inversed{};
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+
+#ifdef __aarch64__
+ max_val = wrapper::vmaxv(vec_max);
+#else // __aarch64__
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+
+ max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
+
+ // Compute left-over elements
+ for (; x < input_width; ++x)
+ {
+ max_val = std::max(*(in_ptr + x), max_val);
+ }
+ } // Compute Max
+
+ float sum_transformed{};
/* Compute exponentials and sum */
{
/* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
/* Init sum to zero */
@@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized(
int x = 0;
for (; x <= (input_width - vec_size); x += vec_size)
{
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
- if (is_log)
+ if (IS_LOG)
{
vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
@@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized(
}
/* Reduce sum */
- const auto sum_16_byte =
+ const float32x4_t sum_16_byte =
vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+
+ float sum;
+
+#ifdef __aarch64__
+ sum = wrapper::vaddv(sum_16_byte);
+#else // __aarch64__
auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
sum_res = vpadd_f32(sum_res, sum_res);
sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
/* Run remaining elements */
for (; x < input_width; ++x)
{
float element{};
- if (is_log)
+ if (IS_LOG)
{
element = (max_val - in_ptr[x]) * scale_beta;
sum += std::exp(element);
@@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized(
tmp_ptr[x] = element;
}
- if (!is_log)
+ if (!IS_LOG)
{
- sum_inversed = 256.f / sum;
+ sum_transformed = 256.f / sum;
}
else
{
- sum = std::log(sum);
+ sum_transformed = std::log(sum);
}
- }
+ } // Compute exponentials and sum
/* Normalize exponentials */
{
constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+
+ const float32x4_t sum_vec = vdupq_n_f32(sum_transformed);
+
/* Loop over row and compute softmax */
int x = 0;
for (; x <= (input_width - vec_size); x += vec_size)
@@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized(
using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
int_vec_type normalized_value{};
- if (is_log)
+ if (IS_LOG)
{
const float32x4x4_t sub = {
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[0], sum_vec),
+ vsubq_f32(vec_in.val[1], sum_vec),
+ vsubq_f32(vec_in.val[2], sum_vec),
+ vsubq_f32(vec_in.val[3], sum_vec),
};
normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
}
else
{
float32x4x4_t mul = {
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[0], sum_vec),
+ vmulq_f32(vec_in.val[1], sum_vec),
+ vmulq_f32(vec_in.val[2], sum_vec),
+ vmulq_f32(vec_in.val[3], sum_vec),
};
if (is_qasymm8_signed)
@@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized(
/* Run remaining elements */
for (; x < input_width; ++x)
{
- if (is_log)
+ if (IS_LOG)
{
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+ out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed);
}
else
{
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+ out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) -
(is_qasymm8_signed ? 128.f : 0));
}
}
- }
+ } // Normalize exponentials
},
- in_it, max_it, out_it);
+ in_it, out_it);
}
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
+template void neon_softmax_quantized<qasymm8_signed_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_signed_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 4d9b789297..60380cd233 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
#include "arm_compute/core/Helpers.h"
@@ -33,105 +33,100 @@ namespace arm_compute
{
namespace cpu
{
-template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{window};
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(in, win);
- Iterator output(out, win);
-
- const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(
- win,
- [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
-
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
- for (int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
+#ifdef __aarch64__
+namespace
+{
+// These helper functions are added because vaddv does not exist for fp16,
+// and therefore it is not part of the wrapper::vaddv interface.
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages)
+{
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a));
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ sum_res = wrapper::vpadd(sum_res, sum_res);
+ }
+ return wrapper::vgetlane(sum_res, 0);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- *out_ptr = max_val;
- },
- input, output);
+inline float wrapper_vaddv(const float32x4_t &a, int sum_stages)
+{
+ ARM_COMPUTE_UNUSED(sum_stages);
+ return wrapper::vaddv(a);
}
+} // namespace
+#endif // __aarch64__
-template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
-
-template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
+// The template implementation for float data types is stored in the header file because
+// we need all fp16 instantiated code to live in fp16.cpp files.
+template <typename T, bool IS_LOG>
+void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
{
- const int start_x = in->info()->valid_region().anchor.x();
+ ARM_COMPUTE_UNUSED(tmp);
+
const int input_width = in->info()->valid_region().shape.x();
Iterator in_it(in, window);
- Iterator max_it(max, window);
Iterator out_it(out, window);
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
- constexpr int vec_size = 16 / sizeof(T);
- const int sum_stages = log2(vec_size / 2);
+ constexpr int vec_size = 16 / sizeof(T);
+
+ const int sum_stages = log2(vec_size >> 1);
+
+ const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
execute_window_loop(
window,
[&](const Coordinates &)
{
/* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
+ const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr());
+ T *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+
+ T max_val;
+
+ /* Compute Max */
+ {
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = 0;
+
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+
+#ifdef __aarch64__
+ max_val = wrapper::vmaxv(vec_max);
+#else // __aarch64__
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+
+ max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
- T sum{};
- T sum_inversed{};
+ // Compute left-over elements
+ for (; x < input_width; ++x)
+ {
+ max_val = std::max(*(in_ptr + x), max_val);
+ }
+ } // compute max
+
+ T sum_transformed{};
/* Compute exponentials and sum */
{
/* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
/* Init sum to zero */
@@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in,
{
auto vec_elements = wrapper::vloadq(in_ptr + x);
vec_elements = wrapper::vsub(vec_elements, vec_max);
- if (is_log)
+ if (IS_LOG)
{
- vec_elements =
- wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ vec_elements = wrapper::vmul(vec_elements, beta_vec);
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
}
else
{
- vec_elements = wrapper::vexpq(
- wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
}
- wrapper::vstore(tmp_ptr + x, vec_elements);
+ wrapper::vstore(out_ptr + x, vec_elements);
}
/* Reduce sum */
+ T sum{};
+#ifdef __aarch64__
+ sum = wrapper_vaddv(vec_sum, sum_stages);
+#else // __aarch64__
auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
for (int i = 0; i < sum_stages; ++i)
{
sum_res = wrapper::vpadd(sum_res, sum_res);
}
sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
/* Run remaining elements */
for (; x < input_width; ++x)
{
T element{};
- if (is_log)
+ if (IS_LOG)
{
element = (in_ptr[x] - max_val) * beta;
sum += std::exp(element);
@@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in,
element = std::exp((in_ptr[x] - max_val) * beta);
sum += element;
}
- tmp_ptr[x] = element;
+
+ out_ptr[x] = element;
}
- if (!is_log)
+ if (!IS_LOG)
{
- sum_inversed = T(1) / sum;
+ sum_transformed = T(1) / sum;
}
else
{
- sum = static_cast<T>(std::log(sum));
+ sum_transformed = static_cast<T>(std::log(sum));
}
- }
+ } // Compute exponentials and sum
/* Normalize exponentials */
{
+ const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{});
+
/* Loop over row and compute softmax */
int x = 0;
for (; x <= (input_width - vec_size); x += vec_size)
{
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if (is_log)
+ const auto vec_in = wrapper::vloadq(out_ptr + x);
+ if (IS_LOG)
{
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec));
}
else
{
- normalized_value =
- wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec));
}
- wrapper::vstore(out_ptr + x, normalized_value);
}
+
/* Run remaining elements */
for (; x < input_width; ++x)
{
- if (is_log)
+ if (IS_LOG)
{
- out_ptr[x] = tmp_ptr[x] - sum;
+ out_ptr[x] = out_ptr[x] - sum_transformed;
}
else
{
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ out_ptr[x] = out_ptr[x] * sum_transformed;
}
}
- }
+ } // Normalize exponentials
},
- in_it, max_it, out_it);
+ in_it, out_it);
}
+
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
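The wrapper_vaddv helpers above exist because AArch64 provides vaddvq_f32 but
no fp16 across-vector add is exposed through the wrapper, so fp16 falls back
to pairwise adds. A hedged standalone sketch (assumes fp16 vector arithmetic;
the function name is hypothetical):

    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    #include <arm_neon.h>

    // Horizontal sum of a float16x8_t via pairwise adds: 8 -> 4 -> 2 -> 1 lanes,
    // i.e. one high/low combine plus log2(8/2) = 2 further vpadd stages.
    float16_t horizontal_add_f16(float16x8_t v)
    {
        float16x4_t s = vpadd_f16(vget_high_f16(v), vget_low_f16(v)); // 8 -> 4 partial sums
        s = vpadd_f16(s, s);                                          // 4 -> 2
        s = vpadd_f16(s, s);                                          // 2 -> 1 (lane 0)
        return vget_lane_f16(s, 0);
    }
    #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC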
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index 40713dc496..9589ebcd7c 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,16 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
+template <bool IS_LOG>
+void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
+ return neon_softmax_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, window);
}
-void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return neon_logits_1d_max<qasymm8_t>(in, out, window);
-}
+template void
+neon_qasymm8_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_qasymm8_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 2c5e284f54..0bf6b2859a 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,17 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
+template <bool IS_LOG>
+void neon_qasymm8_signed_softmax(
+ const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
+ return neon_softmax_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, window);
}
-void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
+template void neon_qasymm8_signed_softmax<true>(
+ const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void neon_qasymm8_signed_softmax<false>(
+ const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
deleted file mode 100644
index 5e94f72faf..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2021-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/CpuTypes.h"
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp16_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<float16_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
deleted file mode 100644
index d692cc2477..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp32_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<float>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 24f1bb8143..0d4b7f4509 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,9 @@ namespace arm_compute
{
namespace cpu
{
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to
+/// a single kernel that performs the softmax operation. The SVE code is left here
+/// for future reference. The implementation for Neon(TM) is introduced in COMPMID-6500
template <typename ScalarType>
void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
{
@@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in,
},
in_it, max_it, out_it);
}
-
-template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template void sve_softmax_logits_1d_float<float>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window);
} // namespace cpu
} // namespace arm_compute
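The conversion the TODO describes, already done for Neon(TM) by this patch, folds the former logits-max kernel and the normalization kernel into a single routine per row: find the maximum, exponentiate the shifted values while accumulating their sum, then normalize. A scalar sketch of that shape, assuming a contiguous float row (the real kernels are vectorized):

#include <algorithm>
#include <cmath>

// Scalar sketch of the fused softmax row the TODO refers to: one routine
// replacing the separate logits-max and normalization kernels.
template <bool IS_LOG>
void fused_softmax_row(const float *in, float *out, int len, float beta)
{
    const float max_val = *std::max_element(in, in + len);

    // out doubles as scratch for the shifted/exponentiated values.
    float sum = 0.f;
    for (int i = 0; i < len; ++i)
    {
        const float shifted = (in[i] - max_val) * beta;
        out[i] = IS_LOG ? shifted : std::exp(shifted);
        sum += IS_LOG ? std::exp(shifted) : out[i];
    }

    // Final normalization: divide for softmax, subtract log-sum for log-softmax.
    const float norm = IS_LOG ? std::log(sum) : sum;
    for (int i = 0; i < len; ++i)
        out[i] = IS_LOG ? out[i] - norm : out[i] / norm;
}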
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
deleted file mode 100644
index 85e5ccfea1..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<qasymm8_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
deleted file mode 100644
index 4be2e2eed6..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 98b2f5117f..a8fb1d4adf 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,9 @@ namespace arm_compute
{
namespace cpu
{
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to
+/// a single kernel that performs the softmax operation. The SVE2 code is left here
+/// for future reference. The implementation for Neon(TM) is introduced in COMPMID-6500
template <typename ScalarType>
void sve2_softmax_logits_1d_quantized(
const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
@@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized(
},
in_it, max_it, out_it);
}
-
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
} // namespace cpu
} // namespace arm_compute
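Both the retained SVE2 code and the new Neon(TM) kernels select between softmax and log-softmax via the is_log flag. For reference, the log variant computes, with beta the scaling factor for the exponent and m the per-row maximum subtracted for numerical stability:

$$\operatorname{log\,softmax}(x_i) = \beta\,(x_i - m) - \log\sum_j e^{\beta\,(x_j - m)}, \qquad m = \max_j x_j$$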
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
deleted file mode 100644
index 95623786b3..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
deleted file mode 100644
index c20462fcef..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_signed_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
index 627ce0c264..c143f6659d 100644
--- a/src/cpu/kernels/softmax/list.h
+++ b/src/cpu/kernels/softmax/list.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,41 +21,24 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SOFTMAX_KERNEL(func_name) \
- void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \
- bool is_log, const Window &window)
+#define DECLARE_SOFTMAX_KERNEL(func_name) \
+ template <bool IS_LOG> \
+ void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax);
DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax);
-DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax);
-DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax);
-DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax);
-DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax);
#undef DECLARE_SOFTMAX_KERNEL
-
-#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window)
-
-DECLARE_LOGITS_KERNEL(neon_fp32_logits);
-DECLARE_LOGITS_KERNEL(neon_fp16_logits);
-DECLARE_LOGITS_KERNEL(neon_qasymm8_logits);
-DECLARE_LOGITS_KERNEL(neon_qasymm8_singed_logits);
-DECLARE_LOGITS_KERNEL(sve_fp32_logits);
-DECLARE_LOGITS_KERNEL(sve_fp16_logits);
-DECLARE_LOGITS_KERNEL(sve_qasymm8_logits);
-DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits);
-
-#undef DECLARE_LOGITS_KERNEL
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
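With the logits-max declarations gone, each remaining DECLARE_SOFTMAX_KERNEL(...) line expands to a templated prototype. For example, DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax) now yields:

template <bool IS_LOG>
void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);

so the max tensor and the runtime is_log flag disappear from every kernel signature, matching the explicit template instantiations in the .cpp files above.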
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
index e55d7f903e..ae14381ad9 100644
--- a/src/cpu/operators/CpuSoftmax.cpp
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,13 +41,10 @@ namespace arm_compute
{
namespace cpu
{
-template <bool IS_LOG>
-CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
+CpuSoftmaxGeneric::CpuSoftmaxGeneric()
: _permute_input(),
_permute_output(),
- _max_kernel(),
_softmax_kernel(),
- _max(),
_tmp(),
_input_permuted(),
_output_permuted(),
@@ -56,8 +53,7 @@ CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
{
}
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis)
+void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -79,29 +75,23 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
// or it is the original input case (2D case)
const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src);
- // Create intermediate tensors shapes
- TensorShape max_sum_shape = tmp_input->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type =
- is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
- TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+ TensorInfo tensor_info_tmp;
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ // Create intermediate tensors shapes
+ const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+ tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32);
+ }
// Init intermediate tensors
- _max = TensorInfo(max_info);
_tmp = TensorInfo(tensor_info_tmp);
// Configure kernels
- auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>();
- mk->configure(tmp_input, &_max);
- _max_kernel = std::move(mk);
-
- auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
+ auto sm = std::make_unique<kernels::CpuSoftmaxKernel>();
if (_needs_permute)
{
// The normalization kernel stores the result in a permuted output tensor
- sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
+ sm->configure(tmp_input, &_output_permuted, beta, is_log, &_tmp);
// Re-permute the permuted output into the requested (4D) output
_permute_output.configure(&_output_permuted, dst,
@@ -110,14 +100,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
else
{
// Softmax 2D case
- sm->configure(tmp_input, &_max, dst, beta, &_tmp);
+ sm->configure(tmp_input, dst, beta, is_log, &_tmp);
}
_softmax_kernel = std::move(sm);
- _aux_mem[InternalTensorIdx::MAX] =
- MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
- _aux_mem[InternalTensorIdx::TMP] =
- MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ if (_tmp.total_size() > 0)
+ {
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ }
_aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
MemoryLifetime::Temporary, _input_permuted.total_size());
@@ -125,8 +116,8 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
MemoryLifetime::Temporary, _output_permuted.total_size());
}
-template <bool IS_LOG>
-Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis)
+Status
+CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log)
{
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -136,17 +127,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
static_cast<int32_t>(src->num_dimensions()) <= axis);
// Create intermediate tensor info
- DataType tmp_data_type = src->data_type();
- const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
- TensorShape max_sum_shape = src->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo tensor_info_max_sum(src->clone()
- ->set_tensor_shape(max_sum_shape)
- .set_data_type(tmp_data_type)
- .set_quantization_info(src->quantization_info())
- .set_is_resizable(true));
- const TensorInfo dont_care;
+ TensorInfo tensor_info_tmp;
+
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true);
+ }
const unsigned int actual_axis =
static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
@@ -165,15 +151,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
}
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
- &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuSoftmaxKernel::validate(src, dst, beta, is_log, &tensor_info_tmp));
return Status{};
}
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
+void CpuSoftmaxGeneric::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
@@ -181,13 +164,11 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
auto dst = tensors.get_tensor(TensorType::ACL_DST);
CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true);
- CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors,
true);
- ITensorPack max_pack;
ITensorPack softmax_pack;
if (_needs_permute)
@@ -195,24 +176,15 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}};
_permute_input.run(permute_in_pack);
- max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}};
-
softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()},
- {TensorType::ACL_SRC_1, max.get()},
{TensorType::ACL_DST_0, output_permuted.get()},
{TensorType::ACL_DST_1, tmp.get()}};
}
else
{
- max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}};
-
- softmax_pack = {{TensorType::ACL_SRC_0, src},
- {TensorType::ACL_SRC_1, max.get()},
- {TensorType::ACL_DST_0, dst},
- {TensorType::ACL_DST_1, tmp.get()}};
+ softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}};
}
- NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
if (_needs_permute)
@@ -224,13 +196,10 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
}
}
-template <bool IS_LOG>
-experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
+experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const
{
return _aux_mem;
}
-template class CpuSoftmaxGeneric<false>;
-template class CpuSoftmaxGeneric<true>;
} // namespace cpu
} // namespace arm_compute
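A consequence of the guarded registration above: for float inputs, workspace() no longer reports a TMP allocation at all; only quantized configurations request the F32 scratch. A hedged sketch of consuming the requirements manually, with MemoryInfo field names (slot, size) assumed from the constructor calls above:

#include "src/cpu/operators/CpuSoftmax.h"

// Illustrative only: walk the operator's memory requirements and allocate
// just the non-empty ones. After this patch a float configuration requests
// no TMP scratch, so that slot simply never appears with a non-zero size.
void report_workspace(const arm_compute::cpu::CpuSoftmaxGeneric &softmax_op)
{
    for (const auto &req : softmax_op.workspace())
    {
        if (req.size == 0)
        {
            continue; // nothing to allocate for this slot
        }
        // ... allocate req.size bytes, wrap them in a Tensor, and add that
        // tensor to the ITensorPack under slot id req.slot before run().
    }
}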
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 8cab70e14f..47020e9b7c 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
-#define ARM_COMPUTE_CPU_SOFTMAX_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
+#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/TensorInfo.h"
@@ -37,9 +37,7 @@ namespace arm_compute
{
namespace cpu
{
-class CpuLogits1DMaxKernel;
-template <bool IS_LOG>
-class CpuLogits1DSoftmaxKernel;
+class CpuSoftmaxKernel;
/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
*
@@ -52,31 +50,31 @@ class CpuLogits1DSoftmaxKernel;
* This function runs the following function/kernels:
* -# If axis is not 0:
* -# @ref CpuPermute
- * -# @ref kernels::CpuLogits1DMaxKernel
- * -# @ref kernels::CpuLogits1DSoftmaxKernel
+ * -# @ref kernels::CpuSoftmaxKernel
*/
-template <bool IS_LOG = false>
class CpuSoftmaxGeneric : public ICpuOperator
{
public:
CpuSoftmaxGeneric();
/** Set the input and output tensors.
*
- * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * last value of each row to the nearest multiple.
- * @param[out] dst Destination tensor ifo. Data types supported: same as @p input.
- * @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p input.
+ * @param[in] beta (Optional) A scaling factor for the exponent.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
* axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+ * @param[in] is_log (Optional) If true, the operation is log-softmax instead of softmax. Defaults to false
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuSoftmaxGeneric::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -85,8 +83,7 @@ public:
private:
enum InternalTensorIdx
{
- MAX = 0,
- TMP,
+ TMP = 0,
PERMUTED_SRC,
PERMUTED_DST,
COUNT
@@ -94,10 +91,8 @@ private:
CpuPermute _permute_input;
CpuPermute _permute_output;
- std::unique_ptr<ICPPKernel> _max_kernel;
std::unique_ptr<ICPPKernel> _softmax_kernel;
- TensorInfo _max;
TensorInfo _tmp;
TensorInfo _input_permuted;
TensorInfo _output_permuted;
@@ -105,9 +100,7 @@ private:
bool _needs_permute;
experimental::MemoryRequirements _aux_mem{};
};
-using CpuSoftmax = CpuSoftmaxGeneric<false>;
-using CpuLogSoftmax = CpuSoftmaxGeneric<true>;
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
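A short usage sketch of the de-templated operator API declared above: is_log is now a runtime argument rather than a CpuSoftmaxGeneric<IS_LOG> template parameter. Tensor-info setup is elided; src_info and dst_info stand in for previously created objects.

#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuSoftmax.h"

// Hedged usage sketch: validate first, then configure with the runtime flag.
void configure_log_softmax(const arm_compute::ITensorInfo      *src_info,
                           arm_compute::ITensorInfo            *dst_info,
                           arm_compute::cpu::CpuSoftmaxGeneric &softmax)
{
    const arm_compute::Status st = arm_compute::cpu::CpuSoftmaxGeneric::validate(
        src_info, dst_info, /* beta */ 1.0f, /* axis */ 0, /* is_log */ true);
    if (bool(st))
    {
        softmax.configure(src_info, dst_info, 1.0f, 0, /* is_log */ true);
        // run() then consumes an ITensorPack with ACL_SRC and ACL_DST entries,
        // plus whatever workspace() reports (see CpuSoftmax.cpp above).
    }
}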
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index e3c2012d05..be588c5b52 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,6 @@
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/cpu/kernels/CpuSoftmaxKernel.h"
#include "src/cpu/operators/CpuSoftmax.h"
namespace arm_compute
@@ -37,13 +36,12 @@ namespace arm_compute
template <bool IS_LOG>
struct NESoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ITensor *src{nullptr};
- ITensor *dst{nullptr};
- Tensor max{nullptr};
- std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{nullptr};
- MemoryGroup memory_group{};
- ITensorPack run_pack{};
- WorkspaceData<Tensor> workspace_tensors{};
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr};
+ MemoryGroup memory_group{};
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace_tensors{};
};
template <bool IS_LOG>
@@ -67,8 +65,8 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
_impl->src = input;
_impl->dst = output;
- _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
- _impl->op->configure(input->info(), output->info(), beta, axis);
+ _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>();
+ _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG);
_impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
@@ -79,7 +77,7 @@ Status
NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG));
return Status{};
}
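At the public API level nothing changes for users: NESoftmaxLayerGeneric<IS_LOG> still carries the log flag in its type and simply forwards it to the operator. A minimal sketch, with allocation shown and data filling elided; NESoftmaxLayer is the IS_LOG == false flavour used by the tests below.

#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(27U, 13U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(27U, 13U), 1, DataType::F32));

    NESoftmaxLayer softmax;
    softmax.configure(&src, &dst, /* beta */ 1.0f, /* axis */ 0);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, then:
    softmax.run();
    return 0;
}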
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index b372bdf3fa..2397d81547 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -22,14 +22,12 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/common/cpuinfo/CpuIsaInfo.h"
#include "src/cpu/kernels/CpuSoftmaxKernel.h"
#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
#include "tests/datasets/ShapeDatasets.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
@@ -42,6 +40,7 @@ namespace test
{
namespace validation
{
+using framework::dataset::make;
namespace
{
/** Tolerance for float operations */
@@ -53,7 +52,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_signed(1);
/** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
DataType::F16,
@@ -66,53 +65,53 @@ TEST_SUITE(NEON)
TEST_SUITE(SoftmaxLayer)
// *INDENT-OFF*
// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
- TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
- TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low
- QuantizationInfo(1.f/256, 12)),
- }),
- framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
- TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
- TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 0)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 0)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 0)),
- })),
- framework::dataset::make("beta", { 1.0,
- 2.0,
- 1.0,
- 2.0,
- 1.0,
- 1.0,
- 2.0,
- 1.0,
- })),
- framework::dataset::make("axis", { 0,
- 0,
- 0,
- 1,
- 0,
- -1,
- 2,
- -3,
- })),
- framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })),
- input_info, output_info, beta, axis, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+ make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low
+ QuantizationInfo(1.f/256, 12)),
+ }),
+ make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+ TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 0)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 0)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 0)),
+ }),
+ make("beta", { 1.0,
+ 2.0,
+ 1.0,
+ 2.0,
+ 1.0,
+ 1.0,
+ 2.0,
+ 1.0,
+ }),
+ make("axis", { 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ -1,
+ 2,
+ -3,
+ }),
+ make("Expected", { false, false, false, true, true, true, false, false })),
+ input_info, output_info, beta, axis, expected)
{
ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
}
@@ -122,54 +121,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
template <typename T>
using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>;
-DATA_TEST_CASE(KernelSelection_max_logits, framework::DatasetMode::ALL, concat(
- combine(framework::dataset::make("CpuExt", std::string("NEON")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16,
- DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- })),
- combine(framework::dataset::make("CpuExt", std::string("SVE")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16,
- DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- }))),
- cpu_ext, data_type)
-{
- using namespace cpu::kernels;
-
- cpuinfo::CpuIsaInfo cpu_isa{};
- cpu_isa.neon = (cpu_ext == "NEON");
- cpu_isa.sve = (cpu_ext == "SVE");
- cpu_isa.fp16 = (data_type == DataType::F16);
-
- const auto *selected_impl = CpuLogits1DMaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
-
- std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_logits_1d_max";
- std::string actual = selected_impl->name;
-
- ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
-}
-
-DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(concat(
- combine(framework::dataset::make("CpuExt", std::string("NEON")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16,
- DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- })),
- combine(framework::dataset::make("CpuExt", std::string("SVE")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16
- }))),
- combine(framework::dataset::make("CpuExt", std::string("SVE2")),
- framework::dataset::make("DataType", { DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- }))),
- cpu_ext, data_type)
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+ concat(concat(
+ combine(
+ make("CpuExt", std::string("NEON")),
+ make("DataType", { DataType::F32,
+ DataType::F16,
+ DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED})
+ ),
+ combine(
+ make("CpuExt", std::string("SVE")),
+ make("DataType", { DataType::F32,
+ DataType::F16}))
+ ),
+ combine(
+ make("CpuExt", std::string("SVE2")),
+ make("DataType", { DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED}))
+ ),
+ cpu_ext, data_type)
{
using namespace cpu::kernels;
@@ -179,11 +150,12 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca
cpu_isa.sve2 = (cpu_ext == "SVE2");
cpu_isa.fp16 = (data_type == DataType::F16);
- const auto *selected_impl = CpuLogits1DSoftmaxKernel<false>::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+ const auto *selected_impl = CpuSoftmaxKernel::get_implementation(
+ SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred);
ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
- std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_softmax_logits_1d";
+ std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax";
std::string actual = selected_impl->name;
ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
@@ -192,26 +164,32 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca
TEST_SUITE(Float)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, 1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, 2, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, 2, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
@@ -220,26 +198,30 @@ TEST_SUITE_END() //FP16
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, -2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::Small4DShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, -2, 3 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+ combine(datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
@@ -252,29 +234,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor,
TEST_SUITE(Quantized)
TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::QASYMM8),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })
+ ),
+ make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, 1, -2 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::QASYMM8),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })),
+ make("Axis", { 0, 1, -2 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.0f }))),
- framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::QASYMM8),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.0f })
+ ),
+ make("Axis", { 0 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -282,20 +275,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framew
TEST_SUITE_END() //QASYMM8
TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::QASYMM8_SIGNED),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })
+ ),
+ make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, 1, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::QASYMM8_SIGNED),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })
+ ),
+ make("Axis", { 0, 1, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8_signed);