diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2023-04-28 10:40:07 +0100 |
---|---|---|
committer | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2023-05-02 15:53:28 +0000 |
commit | 60ab4e66ea3cb85042035fd1aafbfea666bb4ea7 (patch) | |
tree | 05818f7fafb0cf02d337b201756548152090436f | |
parent | d7113e4af5b5497d3a3a62dc9cf6b147e2a024cd (diff) | |
download | ComputeLibrary-60ab4e66ea3cb85042035fd1aafbfea666bb4ea7.tar.gz |
Fix export_to_cl_image issue in the fp16 GeMM implementation
- The issue affects Fp16 GeMM on Arm® Mali™-G78
- The issue was caused by a missing fallback implementation for the
case when export_to_cl_image cannot be used
- The new implementation fixes this issue and makes the GeMM
implementation for M=1 also faster (4-5% on various networks with fully
connected at the end of the model)
- This patch also enables the H0=0 case in the GeMM examples
Resolves COMPMID-5812, COMPMID-5688, and COMPMID-6147
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Change-Id: Ib7b355ae25337962598dd2ba21665b1a6b48686f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/514664
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9526
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
5 files changed, 36 insertions, 18 deletions
diff --git a/examples/gemm_tuner/cl_gemm_reshaped.cpp b/examples/gemm_tuner/cl_gemm_reshaped.cpp index bbc42a7e55..59044477bf 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -256,6 +256,11 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + if(rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); + } + // Initialise lhs_reshaped tensor info lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); diff --git a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp index 79f251ae7d..0ad2a65dc2 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -225,6 +225,11 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + if(rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); + } + // Initialise rhs_reshaped tensor info rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp index 115ce6ba23..9cf9c9fed0 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -240,6 +240,11 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet + if(rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U); + } + lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); lhs_reshaped.info()->set_quantization_info(q_info); diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp index 0e0917daa9..94f3c93166 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -220,6 +220,11 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet + if(rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U); + } + rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); rhs_reshaped.info()->set_quantization_info(q_info); if(rhs_info.export_to_cl_image) diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp index 76551b076a..d08bf84c72 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp @@ -562,21 +562,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn if(m == 1) { - if(r_mn <= 0.0045f) + const GeMMConfigsMatrix configs_mnkb_best = { - if(workload <= 278.7000f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 32, 0, 0, 1, 0, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 0, 1, 0, 0); - } + { 1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 }, + { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, + { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, + { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 }, + { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, + { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, + { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, + { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 } + }; + + return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b); } else { |