Diffstat (limited to 'src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp')
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  39
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 5d2d13e243..5c6f5b4ed0 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -35,11 +35,11 @@ using namespace arm_compute;
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
- _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true)
+ _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output)
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
@@ -47,9 +47,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- _a_offset = a->info()->quantization_info().offset;
- _b_offset = b->info()->quantization_info().offset;
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
// If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors
_is_interleaved_transposed = a->info()->dimension(1) > 16;
@@ -93,7 +96,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
if(_a_offset != 0)
{
TensorShape shape_vector_sum_col = b->info()->tensor_shape();
- if(b->info()->num_dimensions() > 1)
+
+ if(shape_vector_sum_col.num_dimensions() > 1)
{
shape_vector_sum_col.remove_dimension(1);
}
@@ -152,8 +156,21 @@ void CLGEMMLowpMatrixMultiplyCore::run()
// Run reshape matrix A
CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
- // Run reshape matrix B
- CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ if(_is_first_run || !_reshape_b_only_on_first_run)
+ {
+ // Run reshape matrix B
+ CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ }
+ }
+
+ // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
+ if(_is_first_run || !_reshape_b_only_on_first_run)
+ {
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+ }
}
// Run matrix multiply
@@ -165,14 +182,10 @@ void CLGEMMLowpMatrixMultiplyCore::run()
CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
}
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
- }
-
// Run offset contribution kernel
CLScheduler::get().enqueue(_offset_contribution_kernel, true);
_memory_group.release();
+
+ _is_first_run = false;
}
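
For context, a minimal usage sketch of the new configure() overload introduced by this patch; this snippet is not part of the commit, and the tensor shapes, quantization values, and the example() wrapper are illustrative assumptions only. It shows matrix B being reshaped (and reduced, when the A offset is non-zero) only on the first call to run(), which is the behaviour the added _reshape_b_only_on_first_run flag enables.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

using namespace arm_compute;

void example()
{
    CLScheduler::get().default_init();

    // A: 72x32 (rows x cols), B: 32x17, output: 72x17; QASYMM8 inputs, S32 output.
    // Shapes are TensorShape(width, height), so dimension(0) is columns and dimension(1) is rows.
    CLTensor a, b, output;
    a.allocator()->init(TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 5)));
    output.allocator()->init(TensorInfo(TensorShape(17U, 72U), 1, DataType::S32));

    CLGEMMLowpMatrixMultiplyCore gemmlowp;
    // GEMMInfo(is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run):
    // A and B are not pre-reshaped (the patch rejects that), B is reshaped only once.
    gemmlowp.configure(&a, &b, &output, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill a and b with quantized data ...

    gemmlowp.run(); // first run: matrix B is reshaped and its reduction kernel executed
    gemmlowp.run(); // later runs reuse the reshaped B and its column sums
}

A typical use case is inference with constant weights in B: since B never changes between calls, skipping its reshape and reduction after the first run() saves work on every subsequent invocation.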