From fadc9b1e0bba90d6a91beb65466b2a0895b3a5e4 Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Tue, 7 Nov 2023 05:43:07 +0000 Subject: Optimize CpuSoftmaxKernel for axis=0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a single kernel instead of having two consecutive ones. In the previous setup, one kernel calculated the maximum value along the axis, and this maximum was then subtracted from each data element while calculating the softmax, i.e. softmax(x_i) = exp(x_i - max) / sum_i( exp(x_i - max) ) This patch integrates these two stages into a single kernel for Neon™ for all data types. This will save some memory because we don't need to hold the max values in a separate auxiliary tensor. It also introduces some other optimizations that will ease memory pressure when the data type is float/half, by using the dst tensor as temporary storage for already exponentiated inputs. It removes the references to SVE and SVE2 implementations, and most of the associated files; but, it leaves the implementations as these may be used in the future. Resolves: COMPMID-6500 Signed-off-by: Gunes Bayir Change-Id: Icff9976d1214c4c6cbe15a62ca60b8a77d3784cc Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10688 Reviewed-by: SiCong Li Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- src/runtime/NEON/functions/NESoftmaxLayer.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index e3c2012d05..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,7 +29,6 @@ #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" -#include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute @@ -37,13 +36,12 @@ namespace arm_compute template struct NESoftmaxLayerGeneric::Impl { - const ITensor *src{nullptr}; - ITensor *dst{nullptr}; - Tensor max{nullptr}; - std::unique_ptr> op{nullptr}; - MemoryGroup memory_group{}; - ITensorPack run_pack{}; - WorkspaceData workspace_tensors{}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData workspace_tensors{}; }; template @@ -67,8 +65,8 @@ void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, f _impl->src = input; _impl->dst = output; - _impl->op = std::make_unique>(); - _impl->op->configure(input->info(), output->info(), beta, axis); + _impl->op = std::make_unique(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); @@ -79,7 +77,7 @@ Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } -- cgit v1.2.1