aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/operators/CpuSoftmax.h
diff options
context:
space:
mode:
authorGunes Bayir <gunes.bayir@arm.com>2023-11-07 05:43:07 +0000
committerGunes Bayir <gunes.bayir@arm.com>2023-12-05 13:52:17 +0000
commitfadc9b1e0bba90d6a91beb65466b2a0895b3a5e4 (patch)
tree7d095fefe3634b4ca86dc9088bb2990d64d3a7c8 /src/cpu/operators/CpuSoftmax.h
parent23158b0a69b85c9c6e5a7f2457bfe10be04d6132 (diff)
downloadComputeLibrary-fadc9b1e0bba90d6a91beb65466b2a0895b3a5e4.tar.gz
Optimize CpuSoftmaxKernel for axis=0
Implement a single kernel instead of having two consecutive ones. In the previous setup, one kernel was calculating the maximum value in the axis, and this maximum was being subtracted from each data while calculating the softmax, i.e. softmax(x_i) = exp(x_i - max) / sum_i( exp(x_i - max) ) This patch integrates these two stages into a single kernel for Neon™ for all data types. This will save some memory because we don't need to hold the max values in a separate auxiliary tensor. It also introduces some other optimizations that will ease memory pressure when the data type is float/half, by using the dst tensor as temporary storage for already exponentiated inputs. It removes the references to SVE and SVE2 implementations, and most of the associated files; but, it leaves the implementations as these may be used in the future. Resolves: COMPMID-6500 Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Change-Id: Icff9976d1214c4c6cbe15a62ca60b8a77d3784cc Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10688 Reviewed-by: SiCong Li <sicong.li@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/operators/CpuSoftmax.h')
-rw-r--r--src/cpu/operators/CpuSoftmax.h39
1 file changed, 16 insertions(+), 23 deletions(-)
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 8cab70e14f..47020e9b7c 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
-#define ARM_COMPUTE_CPU_SOFTMAX_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
+#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/TensorInfo.h"
@@ -37,9 +37,7 @@ namespace arm_compute
{
namespace cpu
{
-class CpuLogits1DMaxKernel;
-template <bool IS_LOG>
-class CpuLogits1DSoftmaxKernel;
+class CpuSoftmaxKernel;
/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
*
@@ -52,31 +50,31 @@ class CpuLogits1DSoftmaxKernel;
* This function runs the following function/kernels:
* -# If axis is not 0:
* -# @ref CpuPermute
- * -# @ref kernels::CpuLogits1DMaxKernel
- * -# @ref kernels::CpuLogits1DSoftmaxKernel
+ * -# @ref kernels::CpuSoftmaxKernel
*/
-template <bool IS_LOG = false>
class CpuSoftmaxGeneric : public ICpuOperator
{
public:
CpuSoftmaxGeneric();
/** Set the input and output tensors.
*
- * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * last value of each row to the nearest multiple.
- * @param[out] dst Destination tensor info. Data types supported: same as @p input.
- * @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * last value of each row to the nearest multiple.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p input.
+ * @param[in] beta (Optional) A scaling factor for the exponent.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
* axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+ * @param[in] is_log True if the operation is log-softmax
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuSoftmaxGeneric::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -85,8 +83,7 @@ public:
private:
enum InternalTensorIdx
{
- MAX = 0,
- TMP,
+ TMP = 0,
PERMUTED_SRC,
PERMUTED_DST,
COUNT
@@ -94,10 +91,8 @@ private:
CpuPermute _permute_input;
CpuPermute _permute_output;
- std::unique_ptr<ICPPKernel> _max_kernel;
std::unique_ptr<ICPPKernel> _softmax_kernel;
- TensorInfo _max;
TensorInfo _tmp;
TensorInfo _input_permuted;
TensorInfo _output_permuted;
@@ -105,9 +100,7 @@ private:
bool _needs_permute;
experimental::MemoryRequirements _aux_mem{};
};
-using CpuSoftmax = CpuSoftmaxGeneric<false>;
-using CpuLogSoftmax = CpuSoftmaxGeneric<true>;
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H