author    Gunes Bayir <gunes.bayir@arm.com>  2023-11-07 05:43:07 +0000
committer Gunes Bayir <gunes.bayir@arm.com>  2023-12-05 13:52:17 +0000
commit    fadc9b1e0bba90d6a91beb65466b2a0895b3a5e4 (patch)
tree      7d095fefe3634b4ca86dc9088bb2990d64d3a7c8 /src/cpu/kernels/CpuSoftmaxKernel.cpp
parent    23158b0a69b85c9c6e5a7f2457bfe10be04d6132 (diff)
Optimize CpuSoftmaxKernel for axis=0
Implement a single kernel instead of having two consecutive ones. In the
previous setup, one kernel was calculating the maximum value in the axis,
and this maximum was being subtracted from each element while calculating
the softmax, i.e.

    softmax(x_i) = exp(x_i - max) / sum_j( exp(x_j - max) )

This patch integrates these two stages into a single kernel for Neon™ for
all data types. This saves some memory because we no longer need to hold
the max values in a separate auxiliary tensor.

It also introduces some other optimizations that ease memory pressure when
the data type is float/half, by using the dst tensor as temporary storage
for the already exponentiated inputs.

It removes the references to the SVE and SVE2 implementations, and most of
the associated files, but leaves the implementations themselves in place as
they may be used in the future.

Resolves: COMPMID-6500
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: Icff9976d1214c4c6cbe15a62ca60b8a77d3784cc
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10688
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
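For reference, the following is a minimal scalar sketch of the fused,
numerically stable (log-)softmax described above. The function name and
flat-array interface are illustrative only; the actual kernels are
vectorized Neon™ implementations selected from the table in the diff below.

#include <algorithm>
#include <cmath>
#include <cstddef>

// One row along axis=0: pass 1 finds the max (formerly a separate kernel),
// pass 2 exponentiates into dst while accumulating the sum (dst doubles as
// temporary storage for the exponentiated inputs), pass 3 normalizes.
void softmax_1d(const float *src, float *dst, std::size_t len, float beta, bool is_log)
{
    const float max_val = *std::max_element(src, src + len);

    float sum = 0.f;
    for (std::size_t i = 0; i < len; ++i)
    {
        const float shifted = (src[i] - max_val) * beta;
        dst[i] = is_log ? shifted : std::exp(shifted);
        sum += is_log ? std::exp(shifted) : dst[i];
    }

    for (std::size_t i = 0; i < len; ++i)
    {
        dst[i] = is_log ? (dst[i] - std::log(sum)) : (dst[i] / sum);
    }
}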
Diffstat (limited to 'src/cpu/kernels/CpuSoftmaxKernel.cpp')
-rw-r--r--  src/cpu/kernels/CpuSoftmaxKernel.cpp  263
1 file changed, 87 insertions, 176 deletions
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index ce144351f8..486f55e2c1 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -34,9 +34,12 @@
#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/softmax/list.h"
+#include <vector>
+
namespace arm_compute
{
namespace cpu
@@ -45,136 +48,40 @@ namespace kernels
{
namespace
{
-/* Softmax Logits 1D Max - identifying the max value of 1D Logits */
-static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = {
- {"sve_fp32_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_logits)},
- {"sve_fp16_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_logits)},
- {"sve_qu8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
- REGISTER_QASYMM8_SVE(sve_qasymm8_logits)},
- {"sve_qs8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
- REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)},
- {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_logits)},
- {"neon_fp16_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_logits)},
- {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(neon_qasymm8_logits)},
- {"neon_qs8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)},
+/* Softmax */
+static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = {
+ {"neon_fp32_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<false>)},
+ {"neon_fp16_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<false>)},
+ {"neon_qu8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)},
+ {"neon_qs8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)},
+ {"neon_fp32_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<true>)},
+ {"neon_fp16_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<true>)},
+ {"neon_qu8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)},
+ {"neon_qs8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)},
};
-Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if (output.total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(),
- TensorShape(input.tensor_shape()).set(0, 1));
- }
-
- return Status{};
-}
-} //namespace
-const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels()
-{
- return available_kernels_max_logits;
-}
-
-void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
- // Softmax across the x dimension
- const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
- const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- _run_method = uk->ukernel;
- _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
-
- Window win = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
- return Status{};
-}
-
-void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- _run_method(src, dst, window);
-}
-
-const char *CpuLogits1DMaxKernel::name() const
-{
- return _name.c_str();
-}
-
-/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG>
-static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = {
- {"sve2_qu8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)},
- {"sve2_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)},
- {"sve_fp32_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_softmax)},
- {"sve_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_softmax)},
-
- {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_softmax)},
- {"neon_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_softmax)},
- {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)},
- {"neon_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)},
-};
-namespace
-{
-Status validate_arguments_logits_softmax(const ITensorInfo &src,
- const ITensorInfo &max,
- const ITensorInfo &dst,
- const float beta,
- const ITensorInfo &tmp,
- bool is_log)
+Status validate_arguments_softmax(
+ const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log)
{
ARM_COMPUTE_UNUSED(beta);
// Check input
@@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
- // Check max
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
// Check output if configured
if (dst.total_size() != 0)
{
@@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
// Check tmp if configured
if (tmp.total_size() != 0)
{
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
- ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
+ // We have temporary storage only if src data type is quantized.
+ // Therefore, tmp data type must be F32
+ ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric);
+
// We could potentially reduce tmp memory if we could predict or make an assumption
// on the maximum number of threads that will run in parallel.
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
@@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
}
} // namespace
-template <bool IS_LOG>
-const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &
-CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels()
{
- return available_kernels_logits<IS_LOG>;
+ return available_kernels;
}
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(
- const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
+void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
// Configure kernel window
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
// Output auto initialization if not yet initialized
const QuantizationInfo output_quantization =
- is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG)
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log)
: dst->quantization_info();
auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
- // Tmp auto initialization if not yet initialized
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
- auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+ // Tmp auto initialization if not yet initialized and src is quantized
+ if (is_quantized_asymmetric)
+ {
+ const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
+ auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+ }
- const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(
- DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ const auto *uk = CpuSoftmaxKernel::get_implementation(
+ SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- std::string kernel_name =
- IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+ std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel");
_beta = beta;
_run_method = uk->ukernel;
_name = kernel_name.append("/").append(uk->name);
- // Configure kernel window
- Window win = calculate_max_window(*max, Steps());
+ Window win = calculate_max_window(*dst, Steps());
+
+ /// TODO: Check dimensions > 0 for holes only. For this, we need
+ /// a utility function checking if there are holes after some dimension.
+ if (!has_holes(*dst, dst->num_dimensions() - 1))
+ {
+ win = win.collapse(win, Window::DimY);
+ }
- ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::configure(win);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis
+
+ ICpuKernel<CpuSoftmaxKernel>::configure(win);
}
-template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
- const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
+Status CpuSoftmaxKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
return Status{};
}
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window);
ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
- const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
- const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+ {
+ auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
+
+ const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
+ const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
- ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
+ ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
- void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
- _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
+ void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+ _run_method(src, tmp_for_thread, dst, _beta, window);
+ }
+ else
+ {
+ _run_method(src, nullptr, dst, _beta, window);
+ }
}
-template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+const char *CpuSoftmaxKernel::name() const
{
return _name.c_str();
}
-template class CpuLogits1DSoftmaxKernel<true>;
-template class CpuLogits1DSoftmaxKernel<false>;
-
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
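
As a side note on the run_op() change above: for quantized inputs the kernel
still needs an F32 scratch tensor, and each thread is handed a disjoint slice
of it, offset by its thread id. Below is a minimal sketch of that slicing,
with hypothetical names (the real code works on ITensor/ITensorInfo objects
and derives row_length from the valid region of src).

#include <cstddef>
#include <cstdint>

// Returns the start of the scratch slice owned by one thread. One row of F32
// scratch per thread suffices because each thread processes whole rows.
std::uint8_t *tmp_for_thread(std::uint8_t *tmp_buffer,
                             std::size_t   element_size, // tmp element size (F32 -> 4 bytes)
                             std::size_t   row_length,   // elements along the reduction axis
                             int           thread_id)
{
    const std::size_t tmp_size_for_thread = element_size * row_length;
    return tmp_buffer + static_cast<std::size_t>(thread_id) * tmp_size_for_thread;
}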