From 1d25ed54a948639d1894c8b021940df70005d519 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Sat, 16 Dec 2017 19:33:50 +0000 Subject: COMPMID-759 - CLGEMM optimization for McVail benchmarks This patch introduces an optimization for CLGEMM on Bifrost architectures which can reach 40% FMA utilization on config 3 of McVail. The new CLGEMM does not require any reshape of matrix A and matrix B. This patch also adds the auto-config in CLConvolutionLayer and CLGEMM and extends the interface for NEGEMM and CLGEMM. Change-Id: Ibb354eda45e9ca64b14a99700fb21dff5989dda9 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/113716 Tested-by: Jenkins Reviewed-by: Michalis Spyrou Reviewed-by: Anthony Barbier --- src/runtime/NEON/functions/NEGEMM.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'src/runtime/NEON/functions/NEGEMM.cpp') diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 03ba43f901..e640b0604c 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -50,15 +50,17 @@ namespace arm_compute { NEGEMM::NEGEMM(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), - _run_vector_matrix_multiplication(false), _run_addition(false) + _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) { } -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta) +void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d); 
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); + ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); if(c != nullptr) { @@ -70,6 +72,8 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix"); } + // Check if we need to reshape the matrix B only on the first run + _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; // Check if the first input tensor is a vector. @@ -207,8 +211,18 @@ void NEGEMM::run() // Run interleave kernel NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); - // Run transpose kernel - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + if(_is_first_run) + { + // Run transpose kernel + NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + + _is_first_run = false; + } + else if(!_reshape_b_only_on_first_run) + { + // Run transpose kernel + NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + } } NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY); -- cgit v1.2.1