Diffstat (limited to 'src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp')
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  39
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 5d2d13e243..5c6f5b4ed0 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -35,11 +35,11 @@ using namespace arm_compute;
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
- _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true)
+ _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output)
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
@@ -47,9 +47,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- _a_offset = a->info()->quantization_info().offset;
- _b_offset = b->info()->quantization_info().offset;
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
// If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors
_is_interleaved_transposed = a->info()->dimension(1) > 16;
@@ -93,7 +96,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
if(_a_offset != 0)
{
TensorShape shape_vector_sum_col = b->info()->tensor_shape();
- if(b->info()->num_dimensions() > 1)
+
+ if(shape_vector_sum_col.num_dimensions() > 1)
{
shape_vector_sum_col.remove_dimension(1);
}
@@ -152,8 +156,21 @@ void CLGEMMLowpMatrixMultiplyCore::run()
// Run reshape matrix A
CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
- // Run reshape matrix B
- CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ if(_is_first_run || !_reshape_b_only_on_first_run)
+ {
+ // Run reshape matrix B
+ CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ }
+ }
+
+ // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
+ if(_is_first_run || !_reshape_b_only_on_first_run)
+ {
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+ }
}
// Run matrix multiply
@@ -165,14 +182,10 @@ void CLGEMMLowpMatrixMultiplyCore::run()
CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
}
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
- }
-
// Run offset contribution kernel
CLScheduler::get().enqueue(_offset_contribution_kernel, true);
_memory_group.release();
+
+ _is_first_run = false;
}
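
For context, a minimal usage sketch of the new configure() overload introduced by this patch; this snippet is not part of the commit, and the tensor shapes, quantization values, and the example() wrapper are illustrative assumptions only. It shows matrix B being reshaped (and reduced, when the A offset is non-zero) only on the first call to run(), which is the behaviour the added _reshape_b_only_on_first_run flag enables.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

using namespace arm_compute;

void example()
{
    CLScheduler::get().default_init();

    // A: 72x32 (rows x cols), B: 32x17, output: 72x17; QASYMM8 inputs, S32 output.
    // Shapes are TensorShape(width, height), so dimension(0) is columns and dimension(1) is rows.
    CLTensor a, b, output;
    a.allocator()->init(TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 5)));
    output.allocator()->init(TensorInfo(TensorShape(17U, 72U), 1, DataType::S32));

    CLGEMMLowpMatrixMultiplyCore gemmlowp;
    // GEMMInfo(is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run):
    // A and B are not pre-reshaped (the patch rejects that), B is reshaped only once.
    gemmlowp.configure(&a, &b, &output, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill a and b with quantized data ...

    gemmlowp.run(); // first run: matrix B is reshaped and its reduction kernel executed
    gemmlowp.run(); // later runs reuse the reshaped B and its column sums
}

A typical use case is inference with constant weights in B: since B never changes between calls, skipping its reshape and reduction after the first run() saves work on every subsequent invocation.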