From 48b3ef89de5f21a0169d8416e3d54081f82c7bf8 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Mon, 14 Oct 2019 19:03:09 +0100
Subject: COMPMID-2577: Fuse bias addition and activation in gemm assembly
 kernels

Change-Id: I7f52112d2d05b1ea3d3f3d4b19b8eafab05d6c44
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2141
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
---
 arm_compute/runtime/NEON/functions/NEGEMM.h | 36 +++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 10 deletions(-)

(limited to 'arm_compute/runtime/NEON/functions/NEGEMM.h')
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index d947be1ef9..e4d69eb93d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_NEGEMM_H__
 #define __ARM_COMPUTE_NEGEMM_H__
 
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
@@ -33,20 +34,27 @@
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/Tensor.h"
 
-#include <memory>
-
 namespace arm_compute
 {
 /** Basic function to execute GEMM on NEON. This function calls the following NEON kernels:
  *
+ * If optimized assembly is available:
+ *  -# @ref NEGEMMAssemblyDispatch
+ *  -# @ref NEActivationLayer (if alpha != 1.0)
+ * Else:
  *  -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
  *  -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix)
  *  -# @ref NEGEMMMatrixMultiplyKernel
- *  -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
+ * In both cases:
+ *  -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once)
+ * Else:
+ *  -# @ref NEArithmeticAdditionKernel (if c != nullptr, is reshaped once, and optimized assembly is not in place)
  *
+ *  -# @ref NEActivationLayer (if activation is specified in GEMMInfo)
  */
 class NEGEMM : public IFunction
 {
@@ -103,13 +111,21 @@ private:
     NEGEMMMatrixMultiplyKernel _mm_kernel;
     NEGEMMAssemblyDispatch     _asm_glue;
     NEGEMMMatrixAdditionKernel _ma_kernel;
-    Tensor                     _tmp_a;
-    Tensor                     _tmp_b;
-    const ITensor             *_original_b;
-    bool                       _run_vector_matrix_multiplication;
-    bool                       _run_addition;
-    bool                       _reshape_b_only_on_first_run;
-    bool                       _is_prepared;
+    NEActivationLayer          _alpha_scale_func;
+    NEArithmeticAdditionKernel _add_bias_kernel;
+    NEActivationLayer          _activation_func;
+
+    Tensor         _tmp_a;
+    Tensor         _tmp_b;
+    Tensor         _tmp_d;
+    const ITensor *_original_b;
+    bool           _run_vector_matrix_multiplication;
+    bool           _run_alpha_scale;
+    bool           _run_addition;
+    bool           _run_bias_addition;
+    bool           _run_activation;
+    bool           _reshape_b_only_on_first_run;
+    bool           _is_prepared;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEGEMM_H__ */
-- 
cgit v1.2.1