author    | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-07-16 16:16:43 +0100
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-07-22 02:25:50 +0000
commit    | 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 (patch)
tree      | 2f8362d33cdad4212f4b96995681c68184c759e1 /arm_compute
parent    | 59fd7a722e5bc7e85309d6200bc37a772721a719 (diff)
download  | ComputeLibrary-4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2.tar.gz
Update GEMM assembly kernels
- Introduce Fp32 kernels with internal calculations in Bfloat16 when
fast_mode is enabled
- Improve kernel selection heuristics
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r-- | arm_compute/core/CPP/CPPTypes.h |  8
-rw-r--r-- | arm_compute/core/Types.h        | 14
2 files changed, 18 insertions, 4 deletions
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 4484271d63..76378d27ef 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -34,13 +34,15 @@ namespace arm_compute
     X(GENERIC)          \
     X(GENERIC_FP16)     \
     X(GENERIC_FP16_DOT) \
-    X(A35)              \
     X(A53)              \
     X(A55r0)            \
     X(A55r1)            \
+    X(A35)              \
     X(A73)              \
-    X(KLEIN)            \
-    X(X1)
+    X(A510)             \
+    X(X1)               \
+    X(V1)               \
+    X(A64FX)
 
 /** CPU models types
  *
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f6658e7544..9c00cbc88c 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1948,6 +1948,7 @@ public:
       _reinterpret_input_as_3d(false),
       _retain_internal_weights(false),
       _gemmlowp_output_stage(),
+      _fast_math(false),
       _fp_mixed_precision(false),
       _broadcast_bias(false),
       _pretranpose_B(true),
@@ -1967,12 +1968,13 @@ public:
      * @param[in] retain_internal_weights (Optional) Retain the weights tensor from previous run
      * @param[in] gemmlowp_output_stage   (Optional) GEMMLowp Output stage info
      * @param[in] fp_mixed_precision      (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] fast_math               (Optional) Use a data type of shorter width to improve performance
      * @param[in] broadcast_bias          (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
      * @param[in] activation_info         (Optional) Activation to apply after the matrix multiplication
      * @param[in] constant_weights        (Optional) Weights have constant values throughout multiple executions
      */
     GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
-             GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false,
+             GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false,
              const ActivationLayerInfo &activation_info = ActivationLayerInfo(), bool constant_weights = true) noexcept
         : _is_a_reshaped(is_a_reshaped),
           _is_b_reshaped(is_b_reshaped),
@@ -1981,6 +1983,7 @@ public:
           _reinterpret_input_as_3d(reinterpret_input_as_3d),
           _retain_internal_weights(retain_internal_weights),
           _gemmlowp_output_stage(gemmlowp_output_stage),
+          _fast_math(fast_math),
           _fp_mixed_precision(fp_mixed_precision),
           _broadcast_bias(broadcast_bias),
           _pretranpose_B(reshape_b_only_on_first_run),
@@ -2062,6 +2065,14 @@ public:
     {
         return _fp_mixed_precision;
     };
+    /** Flag which specifies if a shorter accumulator to be used.
+     *
+     * @return True if a shorter accumulator has to be used
+     */
+    bool fast_math() const
+    {
+        return _fast_math;
+    };
     /** Flag which specifies whether to broadcast the shape of the bias tensor.
      *
      * @return True if the shape of the bias tensor is to be broadcasted.
@@ -2119,6 +2130,7 @@ private:
     bool                    _reinterpret_input_as_3d;
     bool                    _retain_internal_weights;
     GEMMLowpOutputStageInfo _gemmlowp_output_stage;
+    bool                    _fast_math;
     bool                    _fp_mixed_precision;
     bool                    _broadcast_bias;
     bool                    _pretranpose_B;
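The diff above only adds the fast_math flag and its getter to GEMMInfo; how a caller opts in is not part of this change. Below is a minimal sketch of enabling the flag on a CPU GEMM. It assumes the NEGEMM::configure(a, b, c, d, alpha, beta, gemm_info) overload and the usual Tensor/TensorInfo allocation flow from the library's runtime; the tensor shapes and the choice of NEGEMM are illustrative, not taken from this commit.

// Illustrative sketch only: shows how the fast_math flag added to GEMMInfo
// in this commit could be passed to a GEMM function. Assumes NEGEMM::configure
// accepts a GEMMInfo argument; shapes are arbitrary examples.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, dst;
    // Example F32 operands (layout/shapes chosen only for illustration).
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));

    // Constructor argument order as declared in the diff:
    // (is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run, depth_output_gemm3d,
    //  reinterpret_input_as_3d, retain_internal_weights, gemmlowp_output_stage,
    //  fp_mixed_precision, fast_math, broadcast_bias, activation_info, constant_weights)
    GEMMInfo gemm_info(false, false, true, 0, false, false, GEMMLowpOutputStageInfo(),
                       false, /* fast_math */ true);

    NEGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f, gemm_info);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    gemm.run();
    return 0;
}

Note that setting fast_math only requests the faster path; per the commit message, the FP32 kernels with internal bfloat16 accumulation are picked by the (also updated) kernel selection heuristics, so the reduced-precision path is generally taken only when the target CPU supports it.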