author    Gian Marco <gianmarco.iodice@arm.com>    2018-01-12 10:21:40 +0000
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:44:21 +0000
commit    36a0a4608bf413fc1fd65eb335bfb736ef602149 (patch)
tree      2ff0e35dc9e16fedd601b1f24bdc13d25d075b90 /src/runtime/CL/functions/CLGEMM.cpp
parent    46edf63bd630f5e3f3eb31b7d4602caa317da075 (diff)
COMPMID-748 - Integrating optimized SGEMM for Bifrost

This patch introduces a new GEMM that improves MAC utilisation by 10% compared to the GEMM without reshape. However, this implementation is not faster in all cases, since the time spent reshaping the matrices must be taken into account. For this reason, a heuristic that selects the optimal GEMM variant has been added to the function. More information about the heuristic implementation can be found at COMPMID-852.

With this patch, GoogleNet, MobileNet, VGG16 and SqueezeNet improve performance by 1.5x. More information about the performance uplift can be found here: https://confluence.arm.com/display/MLENG/GEMM+FP32+performance%3A+ACL+18.02

Change-Id: I024563c06b9aed02a211a974e452bae5c233b04c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117140
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
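Once the K and M preconditions are met, the heuristic mentioned above boils down to a size threshold on N. Here is a minimal standalone sketch of the decision (the helper name is hypothetical; it mirrors the is_interleaved_transposed() check added in the diff below, with reshape_b_only_on_first_run and FP32 assumed):

#include <cstdio>

// Hypothetical mirror of the Bifrost FP32 heuristic from this patch;
// assumes reshape_b_only_on_first_run is true and the data type is F32.
static bool use_reshaped_gemm(int m, int n, int k)
{
    if(k <= 256 || m <= 4)
    {
        return false; // Not enough work for the reshape overhead to pay off
    }
    const float scale = k < 1024 ? 2.0f : 2.5f;
    // Reshape wins when scale * n > 1.66 * n + 38.4, i.e. roughly
    // n > 113 for k < 1024 and n > 46 for k >= 1024.
    return scale * n > 1.66f * n + 38.4f;
}

int main()
{
    std::printf("%d\n", use_reshaped_gemm(64, 128, 512)); // 1: N above the crossover
    std::printf("%d\n", use_reshaped_gemm(64, 64, 512));  // 0: N below the crossover
    return 0;
}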
Diffstat (limited to 'src/runtime/CL/functions/CLGEMM.cpp')
-rw-r--r--  src/runtime/CL/functions/CLGEMM.cpp  60
1 file changed, 51 insertions, 9 deletions
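For context, here is a caller-side sketch of how the reshape_b_only_on_first_run hint reaches this heuristic. It is a minimal usage example against the ACL 18.02 API shown in the diff below; the tensor shapes and the GEMMInfo flags are illustrative, not taken from this patch:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // A: M x K = 64 x 512, B: K x N = 512 x 128, output: M x N = 64 x 128.
    // TensorShape lists the innermost (x) dimension first.
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(512U, 64U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(128U, 512U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));

    // reshape_b_only_on_first_run = true is one of the conditions under which
    // the Bifrost heuristic selects the reshaped (interleaved/transposed) path.
    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    gemm.run();
    CLScheduler::get().sync();
    return 0;
}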
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index c676a10978..a09849ab93 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,6 +38,30 @@
using namespace arm_compute;
+namespace
+{
+inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+{
+ bool flag = true;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ // COMPMID-852
+ if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+ {
+ const float scale = k < 1024 ? 2.0f : 2.5f;
+ flag = scale * n > 1.66f * n + 38.4f;
+ }
+ else
+ {
+ flag = false;
+ }
+ }
+
+ return flag;
+}
+} // namespace
+
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
_is_first_run(true), _reshape_b_only_on_first_run(false)
@@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
- // For Bifrost architectures we do not reshape the input matrices
- _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST);
-
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
- // Set the target for the matrix multiply kernel
- _mm_kernel.set_target(CLScheduler::get().target());
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _interleave_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->info()->dimension(1);
+ const int n = b->info()->dimension(0);
+ const int k = a->info()->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ // Check if we need to reshape the matrix A and matrix B
+ _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
if(_is_interleaved_transposed)
{
@@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
+ _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
// Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
+ _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
_memory_group.manage(&_tmp_b);
}
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
if(_is_interleaved_transposed)
{