From 36a0a4608bf413fc1fd65eb335bfb736ef602149 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Fri, 12 Jan 2018 10:21:40 +0000 Subject: COMPMID-748 - Integrating optimized SGEMM for bifrost This patch introduces a new GEMM capable to improve the mac utilisation of 10% compared to the GEMM without reshape. However this implementation is not faster in all cases as we need to take into account the time for reshaping the matrices. For this reason an heuristic solution to select the optimal GEMM to use has been added to the function. More information about the heuristic implementation can be found at COMPMID-852. With this new patch, GoogleNet, MobileNet, VGG16 and SqueezeNet can improved the performance of 1.5x. More information about the performance uplift can be found here: https://confluence.arm.com/display/MLENG/GEMM+FP32+performance%3A+ACL+18.02 Change-Id: I024563c06b9aed02a211a974e452bae5c233b04c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117140 Reviewed-by: Pablo Tello Tested-by: Jenkins Reviewed-by: Anthony Barbier --- src/runtime/CL/functions/CLGEMM.cpp | 60 +++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 9 deletions(-) (limited to 'src/runtime/CL/functions/CLGEMM.cpp') diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index c676a10978..a09849ab93 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -38,6 +38,30 @@ using namespace arm_compute; +namespace +{ +inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target) +{ + bool flag = true; + + if(gpu_target == GPUTarget::BIFROST) + { + // COMPMID-852 + if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run) + { + const float scale = k < 1024 ? 2.0f : 2.5f; + flag = scale * n > 1.66f * n + 38.4f; + } + else + { + flag = false; + } + } + + return flag; +} +} // namespace + CLGEMM::CLGEMM(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) @@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors - // For Bifrost architectures we do not reshape the input matrices - _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST); - // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); const ICLTensor *matrix_a = a; const ICLTensor *matrix_b = b; - // Set the target for the matrix multiply kernel - _mm_kernel.set_target(CLScheduler::get().target()); + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + + // Set the target for the kernels + _interleave_kernel.set_target(gpu_target); + _mm_kernel.set_target(gpu_target); + + // Arguments used by GEMMReshapeInfo + // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo + // in order to know how the matrices have been reshaped + const int m = a->info()->dimension(1); + const int n = b->info()->dimension(0); + const int k = a->info()->dimension(0); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + + if(gpu_target == GPUTarget::BIFROST) + { + mult_transpose1xW_width = 4; + mult_interleave4x4_height = 2; + } + + // Check if we need to reshape the matrix A and matrix B + _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target); if(_is_interleaved_transposed) { @@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel // Configure interleave kernel - _interleave_kernel.configure(a, &_tmp_a); + _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height); // Configure transpose kernel - _transpose_kernel.configure(b, &_tmp_b); + _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width); // Manage intermediate buffers _memory_group.manage(&_tmp_a); _memory_group.manage(&_tmp_b); } - _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed); + _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height)); if(_is_interleaved_transposed) { -- cgit v1.2.1