From 5124be5d1caa70964d452cf9a8cc7c67df31fa9d Mon Sep 17 00:00:00 2001
From: Chunosov
Date: Wed, 22 Nov 2017 20:42:13 +0700
Subject: COMPMID-661: Convolution quantized (#32)

Change-Id: Id69df4ce98d1d89bdf9c9aa5c4d909659909b30f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110456
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com
Reviewed-by: Georgios Pinitas
Reviewed-by: Anthony Barbier
---
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 39 ++++++++++++++--------
 1 file changed, 26 insertions(+), 13 deletions(-)

(limited to 'src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp')

diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 5d2d13e243..5c6f5b4ed0 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -35,11 +35,11 @@ using namespace arm_compute;
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
-      _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true)
+      _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
 
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output)
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
@@ -47,9 +47,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
     ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
-    _a_offset = a->info()->quantization_info().offset;
-    _b_offset = b->info()->quantization_info().offset;
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _a_offset                    = a->info()->quantization_info().offset;
+    _b_offset                    = b->info()->quantization_info().offset;
 
     // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors
     _is_interleaved_transposed = a->info()->dimension(1) > 16;
@@ -93,7 +96,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     if(_a_offset != 0)
     {
         TensorShape shape_vector_sum_col = b->info()->tensor_shape();
-        if(b->info()->num_dimensions() > 1)
+
+        if(shape_vector_sum_col.num_dimensions() > 1)
         {
             shape_vector_sum_col.remove_dimension(1);
         }
@@ -152,8 +156,21 @@ void CLGEMMLowpMatrixMultiplyCore::run()
         // Run reshape matrix A
         CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
 
-        // Run reshape matrix B
-        CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+        if(_is_first_run || !_reshape_b_only_on_first_run)
+        {
+            // Run reshape matrix B
+            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+        }
+    }
+
+    // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
+    if(_is_first_run || !_reshape_b_only_on_first_run)
+    {
+        // Run matrix B reduction kernel only if _a_offset is not equal to 0
+        if(_a_offset != 0)
+        {
+            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+        }
     }
 
     // Run matrix multiply
@@ -165,14 +182,10 @@ void CLGEMMLowpMatrixMultiplyCore::run()
         CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
     }
 
-    // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0)
-    {
-        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
-    }
-
     // Run offset contribution kernel
     CLScheduler::get().enqueue(_offset_contribution_kernel, true);
 
     _memory_group.release();
+
+    _is_first_run = false;
 }
-- 
cgit v1.2.1
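
Usage note (not part of the patch): the sketch below shows how a caller might exercise the new GEMMInfo parameter so that matrix B is reshaped and reduced only on the first run, the typical case when B holds constant quantized weights. It assumes the arm_compute public API of roughly this commit's timeframe; the function name example_gemmlowp(), tensor shapes, and quantization values are illustrative assumptions, not taken from this commit.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    // Hypothetical example: names, shapes and quantization parameters are assumptions for illustration.
    void example_gemmlowp()
    {
        CLScheduler::get().default_init();

        // A: 32x64 (M x K), B: 64x128 (K x N), output: 32x128 (M x N); TensorShape is (width, height).
        CLTensor a, b, output;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
        output.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::S32));

        // GEMMInfo(is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run):
        // with the last flag set, B is reshaped (and its column sums computed) only on the first run.
        CLGEMMLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(&a, &b, &output, GEMMInfo(false, false, true));

        a.allocator()->allocate();
        b.allocator()->allocate();
        output.allocator()->allocate();

        // ... fill a and b with quantized data ...

        gemmlowp.run(); // first run: reshapes B and runs the B reduction kernel
        gemmlowp.run(); // later runs reuse the reshaped B and its reduction result
    }

In a quantized convolution or fully connected layer, B corresponds to the constant weights, so skipping the B reshape and the B reduction after the first run saves work on every subsequent inference while leaving the per-run A reshape, A reduction and offset contribution unchanged.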