From 48b3ef89de5f21a0169d8416e3d54081f82c7bf8 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 14 Oct 2019 19:03:09 +0100
Subject: COMPMID-2577: Fuse bias addition and activation in gemm assembly kernels

Change-Id: I7f52112d2d05b1ea3d3f3d4b19b8eafab05d6c44
Signed-off-by: Georgios Pinitas
Reviewed-on: https://review.mlplatform.org/c/2141
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Pablo Marquez
---
 src/runtime/NEON/functions/NEGEMM.cpp | 100 +++++++++++++++++++++++++++-------
 1 file changed, 81 insertions(+), 19 deletions(-)

(limited to 'src/runtime/NEON/functions/NEGEMM.cpp')

diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index df92b7999c..baa22b7d32 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -34,7 +34,6 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/TensorAllocator.h"
-#include "support/ToolchainSupport.h"
 
 #include <cmath>
 
@@ -43,8 +42,9 @@ using namespace arm_compute::misc::shape_calculator;
 namespace arm_compute
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(), _tmp_a(),
-      _tmp_b(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
+      _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
+      _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -52,34 +52,55 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
 {
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
 
+    const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
+    bool run_optimised   = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info));
+
     // Check if we need to reshape the matrix B only on the first run
     _is_prepared                      = false;
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
     _original_b                       = b;
-
-    bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
+    _run_alpha_scale                  = alpha != 1.f;
+    _run_bias_addition                = c != nullptr && gemm_info.reshape_b_only_on_first_run();
+    _run_addition                     = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
+    _run_activation                   = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !NEGEMMAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
 
     if(run_optimised)
     {
+        const ITensor *c_to_use = is_c_bias ? c : nullptr;
+
         if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE)
         {
             GEMMInfo gemm_info_ntb = gemm_info;
             gemm_info_ntb.set_pretranpose_B(false);
-            _asm_glue.configure(a, b, c, d, alpha, beta, gemm_info_ntb);
+            _asm_glue.configure(a, b, c_to_use, d, gemm_info_ntb);
         }
         else
         {
-            _asm_glue.configure(a, b, c, d, alpha, beta, gemm_info);
+            _asm_glue.configure(a, b, c_to_use, d, gemm_info);
         }
         ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
+
+        // Scale product by alpha
+        if(_run_alpha_scale)
+        {
+            _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+        }
     }
     else
     {
+        // Pick output tensor in case bias addition should be performed
+        ITensor *gemm_output_to_use = d;
+        if(_run_bias_addition)
+        {
+            gemm_output_to_use = &_tmp_d;
+            _memory_group.manage(&_tmp_d);
+        }
+
+        // Select between GEMV and GEMM
         if(_run_vector_matrix_multiplication)
         {
             // Configure the matrix multiply kernel
-            _mm_kernel.configure(a, b, d, alpha, false);
+            _mm_kernel.configure(a, b, gemm_output_to_use, alpha, false);
         }
         else
         {
@@ -117,7 +138,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
             _transpose_kernel.configure(b, &_tmp_b);
 
             // Configure matrix multiplication kernel
-            _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha, true, GEMMReshapeInfo(m, n, k));
+            _mm_kernel.configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
 
             // Allocate once the all configure methods have been called
             _tmp_a.allocator()->allocate();
@@ -127,18 +148,31 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
         }
     }
 
-    // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
+        if(_run_bias_addition)
         {
-            _ma_kernel.configure(c, d, beta);
-            _run_addition = true;
+            _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
+            _tmp_d.allocator()->allocate();
         }
     }
+
+    // Configure matrix addition kernel
+    if(_run_addition)
+    {
+        _ma_kernel.configure(c, d, beta);
+    }
+
+    // Configure activation
+    const ActivationLayerInfo &activation = gemm_info.activation_info();
+    if(_run_activation)
+    {
+        _activation_func.configure(d, nullptr, activation);
+    }
 }
 
 Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
+    const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
@@ -147,7 +181,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
-    if(c != nullptr)
+    if(c != nullptr && !is_c_bias)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
@@ -178,7 +212,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     }
 
     // Check if we need to run the optimized assembly kernel
-    const bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, alpha, beta, gemm_info));
+    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info));
 
     if(!run_optimised)
     {
@@ -225,14 +259,26 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
         // Validate matrix multiply
         auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+
+        if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
+        }
     }
 
     // Validate matrix addition kernel
-    if(beta != 0 && c != nullptr)
+    if(beta != 0 && c != nullptr && !is_c_bias)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta));
     }
 
+    // Validate activation
+    const ActivationLayerInfo &activation = gemm_info.activation_info();
+    if(activation.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
+    }
+
     return Status{};
 }
 
@@ -245,6 +291,10 @@ void NEGEMM::run()
     if(_asm_glue.is_configured())
     {
         _asm_glue.run();
+        if(_run_alpha_scale)
+        {
+            _alpha_scale_func.run();
+        }
     }
     else
     {
@@ -262,12 +312,24 @@ void NEGEMM::run()
 
         NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
 
-        // Run matrix addition kernel
-        if(_run_addition)
+        // Run bias addition kernel
+        if(_run_bias_addition)
         {
-            NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+            NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
         }
     }
+
+    // Run matrix addition kernel
+    if(_run_addition)
+    {
+        NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+    }
+
+    // Run activation function
+    if(_run_activation)
+    {
+        _activation_func.run();
+    }
 }
 
 void NEGEMM::prepare()
-- 
cgit v1.2.1
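
Editor's note: for reference, a minimal caller-side sketch (not part of the patch) of the fused path this change enables. When is_c_bias holds, c is forwarded to the assembly kernel as a bias, and the ActivationLayerInfo carried by GEMMInfo is fused whenever NEGEMMAssemblyDispatch::is_activation_supported() agrees, with NEActivationLayer as the fallback. The tensor shapes, the GEMMInfo constructor arguments, and the set_activation_info() call are illustrative assumptions; the exact GEMMInfo API differs across Compute Library releases.

// Usage sketch only: names, shapes, and the GEMMInfo setup are hypothetical.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, c, d;
    // TensorShape is (width, height): A is M=32 x K=64, B is K=64 x N=16.
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
    c.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));      // bias of length N
    d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // output, M x N

    // In this revision is_c_bias is gated on reshape_b_only_on_first_run(),
    // so enabling it routes c into the assembly kernel as a fused bias.
    GEMMInfo gemm_info(false /* is_a_reshaped */, false /* is_b_reshaped */,
                       true /* reshape_b_only_on_first_run */);
    // Assumed setter; some releases pass the ActivationLayerInfo through the
    // GEMMInfo constructor instead.
    gemm_info.set_activation_info(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

    NEGEMM gemm;
    gemm.configure(&a, &b, &c, &d, 1.f /* alpha */, 1.f /* beta */, gemm_info);

    a.allocator()->allocate();
    b.allocator()->allocate();
    c.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a, b and c with data ...

    gemm.run(); // d = min(max(A*B + bias, 0), 6), fused in the assembly kernel where supported
    return 0;
}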