From 4aed4aafa2ddb0b6f4b76aef5008c8bb45599ea4 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Fri, 7 Aug 2020 15:36:30 +0100
Subject: COMPMID-3683: Fix performance regression on Mali-G76 (Fully connected)
 COMPMID-3682: Fix performance regression on Mali-G76 (Convolution)

Updated the heuristic for GEMMReshapedOnlyRHS for Mali-G76 in order to
take into account small workload cases

Change-Id: I99fccbd0e94e4e21c0d1b88e23f02af06ef16ee9
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3689
Reviewed-by: SiCong Li
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 ...MMReshapedOnlyRHSKernelConfigurationBifrost.cpp | 35 ++++++++++++++++------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
index 581c2d2199..f9b65dc931 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -149,34 +149,51 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
     GEMMLHSMatrixInfo lhs_info_img;
     GEMMRHSMatrixInfo rhs_info_img;
 
+    const bool is_workload_big = ((m * n * b) / 16) >= 2048;
     // Get lhs_info/rhs_info in case of OpenCL buffer
     if(m == 1)
     {
-        const unsigned int h0 = std::max(n / 2, 1U);
-        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+        if((n / 4) >= 2048)
+        {
+            const unsigned int h0 = std::max(n / 4, 1U);
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 2, 1U);
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+        }
     }
     else
     {
-        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+        if(is_workload_big)
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+        }
     }
 
     // Get lhs_info/rhs_info in case of OpenCL image
-    if(m == 1)
+    const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+    if(is_workload_big)
     {
-        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 4, false, true, false, false, true);
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
     }
     else
     {
-        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
-        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
     }
 
     const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
     const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
     const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32);
 
-    // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
-    const bool use_cl_image2d = (m == 1 && n <= 4096) ? false : true;
+    // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true;
 
     if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
     {
-- 
cgit v1.2.1
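
Note (not part of the patch): below is a minimal standalone sketch of the workload-size heuristic this change introduces, for readers who want to see the selection logic in isolation. The struct GemmShape, the free functions is_workload_big()/use_cl_image2d(), and the sample shapes in main() are illustrative assumptions; only the thresholds (((m * n * b) / 16) >= 2048 and n < 128) and the buffer-vs-image2d decision are taken from the diff above. In the library the image2d path is additionally gated by validate_image2d_support_on_rhs(), which the sketch does not model.

#include <cstdio>

struct GemmShape
{
    unsigned int m; // output rows (1 for vector-by-matrix, e.g. fully connected at batch 1)
    unsigned int n; // output columns
    unsigned int b; // batch size
};

// Mirrors the patch: const bool is_workload_big = ((m * n * b) / 16) >= 2048;
static bool is_workload_big(const GemmShape &s)
{
    return ((s.m * s.n * s.b) / 16) >= 2048;
}

// Mirrors the patch: keep the reshaped RHS in an OpenCL buffer for vector-by-matrix
// or small workloads, otherwise prefer the OpenCL image2d export.
static bool use_cl_image2d(const GemmShape &s)
{
    const bool vector_by_matrix = (s.m == 1);
    const bool small_workload   = !is_workload_big(s) && (s.n < 128);
    return !(vector_by_matrix || small_workload);
}

int main()
{
    const GemmShape convolution_like = { 256, 256, 1 }; // big workload -> image2d
    const GemmShape fully_connected  = { 1, 1024, 1 };  // vector-by-matrix -> buffer
    const GemmShape small_case       = { 8, 64, 1 };    // small workload -> buffer

    std::printf("convolution_like: use_cl_image2d=%d\n", use_cl_image2d(convolution_like));
    std::printf("fully_connected:  use_cl_image2d=%d\n", use_cl_image2d(fully_connected));
    std::printf("small_case:       use_cl_image2d=%d\n", use_cl_image2d(small_case));
    return 0;
}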