From c6eaec3610fa27651582f6c1acad35afffe360f6 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 20 Jul 2020 13:31:05 +0100 Subject: COMPMID-3326; Update heuristic for GEMMReshaped and GEMMReshapedOnlyRHS - Update the heuristic for Arm Mali-G77 (F32) in order to use the OpenCL image2d object on GEMM Change-Id: Ife6736a22ec2a114368bb338908f0c5f239dfad6 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3593 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio --- ...MMReshapedOnlyRHSKernelConfigurationValhall.cpp | 59 ++++++++++++++++++---- src/runtime/CL/functions/CLGEMM.cpp | 10 +++- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp index 11cb90ba19..9f3461e912 100644 --- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp +++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp @@ -27,6 +27,9 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include #include @@ -35,6 +38,8 @@ namespace arm_compute { namespace cl_gemm { +using namespace arm_compute::misc::shape_calculator; + CLGEMMReshapedOnlyRHSKernelConfigurationValhall::CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu) : ICLGEMMKernelConfiguration(gpu) { @@ -74,32 +79,66 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi std::pair CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + // Get lhs_info/rhs_info in case of OpenCL buffer if(m == 1) { - if(n > 2048) + const unsigned int h0 = std::max(n / 4, 1U); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); + } + else + { + if(m > 256) { - return configure_lhs_rhs_info(m, n, 1, 8, 2, 1, 256, false, true, false, true); + const int v0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, v0, false, true, false, true); } else { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 256, false, true, false, true); + const int v0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, v0, false, true, false, true); } } + + // Get lhs_info/rhs_info in case of OpenCL image + if(m == 1) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 8, true, true, false, false, true); + } else { - if(m > 300) + if((m / 4) * (n / 4) > 4096) { - const int v0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, v0, false, true, false, true); + const int h0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); } else { - const int v0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, v0, false, true, false, true); + const int h0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, h0, false, true, false, false, true); } } + + const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); + const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); + const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); + + // In case of small workloads, we use the OpenCL buffer rather than the OpenCL image2d + const bool use_cl_image2d = ((m / lhs_info_img.m0) * (n / rhs_info_img.n0)) * b < 1024 ? false : true; + + if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + { + return std::make_pair(lhs_info_img, rhs_info_img); + } + else + { + return std::make_pair(lhs_info_buf, rhs_info_buf); + } } std::pair CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) @@ -120,7 +159,7 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); } } - else if (m < 128) + else if(m < 128) { const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); if(k >= 512) diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 43eb736d40..4a74630036 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -291,8 +291,16 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context std::unique_ptr gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + unsigned int m_internal = m; + unsigned int b_internal = batch_size; + if(reinterpret_input_as_3d) + { + m_internal = a->info()->dimension(1); + b_internal = a->info()->dimension(2); + } + // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); + std::tie(lhs_info, rhs_info) = gemm_config->configure(m_internal, n, k, b_internal, data_type); ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) -- cgit v1.2.1