aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels/assembly/arm_gemm.hpp
diff options
context:
space:
mode:
authorRadu Salavat <radu.salavat@arm.com>2024-02-27 18:32:26 +0000
committerRadu Salavat <radu.salavat@arm.com>2024-04-11 08:47:50 +0000
commitf1f1f87132690a8061801ef1a4638d637c780df7 (patch)
tree8ad4c3739217b3bc6281f4e0b9a7a63fe6c3f9bb /src/cpu/kernels/assembly/arm_gemm.hpp
parent1322065a3fbd15b00dbfb0969d6b438b5ba15530 (diff)
downloadComputeLibrary-f1f1f87132690a8061801ef1a4638d637c780df7.tar.gz
Add in place summation to CPU GEMM kernels
Instead of dispatching the sum postop for GEMM kernels to a separate kernel + add, that requires an extra destination sized allocation, plus 3 extra load/stores per element, just do it in the GEMM kernel. Resolves: ONCPUML-1442 Signed-off-by: Radu Salavat <radu.salavat@arm.com> Co-authored-by: Milos Puzovic <milos.puzovic@arm.com> Change-Id: I7a1f2da3300875fa1ac88b705a34390969518077 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11298 Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/kernels/assembly/arm_gemm.hpp')
-rw-r--r--src/cpu/kernels/assembly/arm_gemm.hpp11
1 files changed, 10 insertions, 1 deletions
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
index 9a913c5c58..5d7cf79857 100644
--- a/src/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,6 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
+#ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP
+#define ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP
+
#pragma once
#include "arm_gemm_local.hpp"
@@ -151,6 +155,7 @@ public:
int _maxthreads;
bool _fixed_format;
bool _fast_mode;
+ bool _accumulate;
const GemmConfig *_cfg;
GemmArgs(const CPUInfo *ci,
@@ -165,6 +170,7 @@ public:
const int maxthreads,
bool fixed_format = false,
bool fast_mode = false,
+ bool accumulate = false,
const GemmConfig *cfg = nullptr)
: _ci(ci),
_Msize(M),
@@ -178,6 +184,7 @@ public:
_maxthreads(maxthreads),
_fixed_format(fixed_format),
_fast_mode(fast_mode),
+ _accumulate(accumulate),
_cfg(cfg)
{
}
@@ -278,3 +285,5 @@ template <typename Top, typename Tret, class OutputStage = Nothing>
bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const OutputStage & = {});
} // namespace arm_gemm
+
+#endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP