From f1f1f87132690a8061801ef1a4638d637c780df7 Mon Sep 17 00:00:00 2001
From: Radu Salavat <radu.salavat@arm.com>
Date: Tue, 27 Feb 2024 18:32:26 +0000
Subject: Add in place summation to CPU GEMM kernels

Instead of dispatching the sum postop for GEMM kernels to a
separate kernel + add, that requires an extra destination sized
allocation, plus 3 extra load/stores per element,
just do it in the GEMM kernel.

Resolves: ONCPUML-1442

Signed-off-by: Radu Salavat <radu.salavat@arm.com>
Co-authored-by: Milos Puzovic <milos.puzovic@arm.com>
Change-Id: I7a1f2da3300875fa1ac88b705a34390969518077
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11298
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 4f732f7d94..d8b464584a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -350,6 +350,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
     const bool _thread_columns;
 
     const Activation _act;
+    const bool _accumulate;
 
     const int _maxthreads;
     int _nthreads;
@@ -680,7 +681,7 @@ public:
                       _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
                       _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
                       _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _act(args._act), _accumulate(args._accumulate), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
                       _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
                       _os(os) { }
 
@@ -690,7 +691,7 @@ public:
                       _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
                       _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
                       _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _act(args._act), _accumulate(args._accumulate),  _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
                       _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
                       _os() { }
 
@@ -823,7 +824,7 @@ public:
                             // Only do bias on the first pass
                             ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
                             // Only do activation on the last pass, and accumulation on any non-first pass.
-                            (last_pass ? _act : Activation()), !first_pass,
+                            (last_pass ? _act : Activation()), (!first_pass || _accumulate),
                             // Pass in quantization parameters for requantizing kernels (others will ignore)
                             _os, col_bias + (multi * _Nsize),
                             // Accumulation buffer
@@ -971,7 +972,7 @@ public:
                             // Only do bias on the first pass
                             ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                             // Only do activation on the last pass, and accumulation on any non-first pass.
-                            (last_pass ? _act : Activation()), !first_pass,
+                            (last_pass ? _act : Activation()), (!first_pass || _accumulate),
                             // Pass in quantization parameters for requantizing kernels (others will ignore)
                             _os, col_bias + (current.multi() * _Nsize),
                             // Accumulation buffer
-- 
cgit v1.2.1