diff options
Diffstat (limited to 'src/runtime/CL/functions/CLGEMM.cpp')
-rw-r--r-- | src/runtime/CL/functions/CLGEMM.cpp | 60 |
1 file changed, 51 insertions, 9 deletions
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index c676a10978..a09849ab93 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -38,6 +38,30 @@ using namespace arm_compute; +namespace +{ +inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target) +{ + bool flag = true; + + if(gpu_target == GPUTarget::BIFROST) + { + // COMPMID-852 + if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run) + { + const float scale = k < 1024 ? 2.0f : 2.5f; + flag = scale * n > 1.66f * n + 38.4f; + } + else + { + flag = false; + } + } + + return flag; +} +} // namespace + CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) @@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors - // For Bifrost architectures we do not reshape the input matrices - _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST); - // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); const ICLTensor *matrix_a = a; const ICLTensor *matrix_b = b; - // Set the target for the matrix multiply kernel - _mm_kernel.set_target(CLScheduler::get().target()); + // Get the GPU target + const GPUTarget gpu_target = 
CLScheduler::get().target(); + + // Set the target for the kernels + _interleave_kernel.set_target(gpu_target); + _mm_kernel.set_target(gpu_target); + + // Arguments used by GEMMReshapeInfo + // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo + // in order to know how the matrices have been reshaped + const int m = a->info()->dimension(1); + const int n = b->info()->dimension(0); + const int k = a->info()->dimension(0); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + + if(gpu_target == GPUTarget::BIFROST) + { + mult_transpose1xW_width = 4; + mult_interleave4x4_height = 2; + } + + // Check if we need to reshape the matrix A and matrix B + _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target); if(_is_interleaved_transposed) { @@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel // Configure interleave kernel - _interleave_kernel.configure(a, &_tmp_a); + _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height); // Configure transpose kernel - _transpose_kernel.configure(b, &_tmp_b); + _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width); // Manage intermediate buffers _memory_group.manage(&_tmp_a); _memory_group.manage(&_tmp_b); } - _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed); + _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height)); if(_is_interleaved_transposed) { |