From 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 16 Jul 2021 16:16:43 +0100
Subject: Update GEMM assembly kernels

- Introduce Fp32 kernels with internal calculations in Bfloat16 when
  fast_mode is enabled
- Improve kernel selection heuristics

Signed-off-by: Georgios Pinitas
Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 arm_compute/core/Types.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'arm_compute/core/Types.h')

diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f6658e7544..9c00cbc88c 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1948,6 +1948,7 @@ public:
           _reinterpret_input_as_3d(false),
           _retain_internal_weights(false),
           _gemmlowp_output_stage(),
+          _fast_math(false),
           _fp_mixed_precision(false),
           _broadcast_bias(false),
           _pretranpose_B(true),
@@ -1967,12 +1968,13 @@ public:
      * @param[in] retain_internal_weights (Optional) Retain the weights tensor from previous run
      * @param[in] gemmlowp_output_stage   (Optional) GEMMLowp Output stage info
      * @param[in] fp_mixed_precision      (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] fast_math               (Optional) Use a data type of shorter width to improve performance
      * @param[in] broadcast_bias          (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
      * @param[in] activation_info         (Optional) Activation to apply after the matrix multiplication
      * @param[in] constant_weights        (Optional) Weights have constant values throughout multiple executions
      */
     GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
-             GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false,
+             GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false,
              const ActivationLayerInfo &activation_info = ActivationLayerInfo(), bool constant_weights = true) noexcept
         : _is_a_reshaped(is_a_reshaped),
           _is_b_reshaped(is_b_reshaped),
@@ -1981,6 +1983,7 @@ public:
           _reinterpret_input_as_3d(reinterpret_input_as_3d),
           _retain_internal_weights(retain_internal_weights),
           _gemmlowp_output_stage(gemmlowp_output_stage),
+          _fast_math(fast_math),
           _fp_mixed_precision(fp_mixed_precision),
           _broadcast_bias(broadcast_bias),
           _pretranpose_B(reshape_b_only_on_first_run),
@@ -2062,6 +2065,14 @@ public:
     {
         return _fp_mixed_precision;
     };
+    /** Flag which specifies if a shorter accumulator to be used.
+     *
+     * @return True if a shorter accumulator has to be used
+     */
+    bool fast_math() const
+    {
+        return _fast_math;
+    };
     /** Flag which specifies whether to broadcast the shape of the bias tensor.
      *
      * @return True if the shape of the bias tensor is to be broadcasted.
@@ -2119,6 +2130,7 @@ private:
     bool                    _reinterpret_input_as_3d;
     bool                    _retain_internal_weights;
     GEMMLowpOutputStageInfo _gemmlowp_output_stage;
+    bool                    _fast_math;
     bool                    _fp_mixed_precision;
     bool                    _broadcast_bias;
     bool                    _pretranpose_B;
--
cgit v1.2.1
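
Editor's note: the snippet below is an illustrative sketch, not part of the patch. It only exercises the GEMMInfo constructor and the fast_math() getter added by this change; the surrounding main() scaffolding and the choice of argument values are assumptions for demonstration, and whether a given GEMM backend honours the flag depends on the kernels introduced elsewhere in this commit.

    #include "arm_compute/core/Types.h"

    #include <iostream>

    int main()
    {
        using namespace arm_compute;

        // Construct a GEMMInfo with fast_math enabled, using the constructor
        // signature introduced by this patch (fast_math sits between
        // fp_mixed_precision and broadcast_bias).
        GEMMInfo gemm_info(false /* is_a_reshaped */,
                           false /* is_b_reshaped */,
                           true  /* reshape_b_only_on_first_run */,
                           0     /* depth_output_gemm3d */,
                           false /* reinterpret_input_as_3d */,
                           false /* retain_internal_weights */,
                           GEMMLowpOutputStageInfo(),
                           false /* fp_mixed_precision */,
                           true  /* fast_math */);

        // Query the flag through the new getter; kernels that support it may
        // then run FP32 GEMMs with internal BFloat16 arithmetic.
        std::cout << "fast_math: " << gemm_info.fast_math() << std::endl;
        return 0;
    }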