From ebcebf1dee7f8314976b1e0cabd62b4cf893d765 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Wed, 21 Oct 2020 00:04:14 +0100
Subject: COMPMID-3638: Move NEON kernels

Signed-off-by: Michalis Spyrou
Change-Id: Ieed3e4bc8be7fef80c90c5094599b477a56fc473
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4285
Comments-Addressed: Arm Jenkins
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
---
 .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 75 ++++++++++++++--------
 1 file changed, 48 insertions(+), 27 deletions(-)

diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 36357dde41..9050427b34 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -34,12 +34,23 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 using namespace arm_compute::misc::shape_calculator;
 
+NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
+
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(),
@@ -80,7 +91,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
         _memory_group.manage(&_signed_a);
-        _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+        _convert_to_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
         a_to_use = &_signed_a;
 
         _a_offset = _signed_a.info()->quantization_info().uniform().offset;
@@ -153,10 +165,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         }
 
         // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a);
+        _mtx_a_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+        _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
 
         // Configure transpose kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+        _mtx_b_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+        _mtx_b_reshape_kernel->configure(b, &_tmp_b);
     }
 
     if(!_fused_assembly_path)
@@ -176,7 +190,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         }
 
         // Configure Matrix B reduction kernel
-        _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info);
+        _mtx_b_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixBReductionKernel>();
+        _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
     }
 
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -188,7 +203,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         _memory_group.manage(&_vector_sum_row);
 
         // Configure matrix A reduction kernel
-        _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info);
+        _mtx_a_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+        _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
     }
 
     if(_fuse_output_stage)
@@ -196,19 +212,22 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         // Configure matrix multiply kernel
         if(!_assembly_path)
         {
-            _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32);
+            _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
         }
 
-        _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
-                                                           _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                                           _b_offset == 0 ? nullptr : &_vector_sum_row, c,
-                                                           _flip_signedness ? &_signed_output : output,
-                                                           a->info()->dimension(0),
-                                                           _a_offset, _b_offset, info.gemmlowp_output_stage());
+        _offset_contribution_output_stage_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
+        _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
+                                                            _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                                            _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+                                                            _flip_signedness ? &_signed_output : output,
+                                                            a->info()->dimension(0),
+                                                            _a_offset, _b_offset, info.gemmlowp_output_stage());
 
         if(_flip_signedness)
         {
-            _convert_from_signed_asymm.configure(&_signed_output, output);
+            _convert_from_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+            _convert_from_signed_asymm->configure(&_signed_output, output);
         }
     }
     else
@@ -216,10 +235,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         // Configure matrix multiply kernel
         if(!_assembly_path)
         {
-            _mm_kernel.configure(matrix_a, matrix_b, output);
+            _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            _mm_kernel->configure(matrix_a, matrix_b, output);
         }
         // Configure offset contribution kernel
-        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
+        _offset_contribution_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionKernel>();
+        _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
     }
 
     // Configure activation
@@ -468,7 +489,7 @@ void NEGEMMLowpMatrixMultiplyCore::run()
     // Convert QASYMM8->QASYMM8_SIGNED
     if(_flip_signedness)
     {
-        NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+        NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY);
     }
 
     // Run GEMM
@@ -481,15 +502,15 @@
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
-            NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY);
+            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
 
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+                NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
             }
         }
-        NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
     }
 
     if(!_fused_assembly_path)
@@ -497,31 +518,31 @@
         // Run matrix A reduction kernel only if _b_offset is not equal to 0
         if(_b_offset != 0)
         {
-            NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+            NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX);
         }
 
         // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
         {
-            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+            NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
         }
 
        if(_fuse_output_stage)
        {
            // Run offset contribution kernel
-            NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+            NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY);
        }
        else
        {
            // Run offset contribution kernel
-            NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+            NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY);
        }
    }

    // Convert QASYMM8_SIGNED->QASYMM8
-    if(_flip_signedness)
+    if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
    {
-        NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+        NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY);
    }

    // Run fused activation unless already run in the fused assembly
@@ -560,7 +581,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
             // Run reshape kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
-            NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
             if(!original_b_managed_by_weights_manager)
             {
                 _original_b->mark_as_unused();
             }
@@ -571,7 +592,7 @@
        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
-            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+            NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
        }

        _is_prepared = true;
-- 
cgit v1.2.1
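
The change this patch repeats for every kernel is the same ownership move: kernels stop being value members of the function and become std::unique_ptr members created at configure() time, with NEScheduler taking the raw pointer via .get(); the out-of-line defaulted destructor added above is what allows the public header to hold unique_ptrs to forward-declared kernel types, so the kernel headers themselves can move into src/core. Below is a minimal standalone sketch of that pattern; the names Function and InterleaveKernel are illustrative placeholders rather than Compute Library API, and std::make_unique (C++14) stands in for the arm_compute::support::cpp14::make_unique backport used in the patch.

#include <memory>

// --- "header" side ----------------------------------------------------------
// Only a forward declaration is needed: the kernel header no longer has to be
// included by the public function header.
class InterleaveKernel;

class Function
{
public:
    Function();
    ~Function(); // defaulted in the ".cpp" part below, where InterleaveKernel is complete
    void configure(int rows);
    void run();

private:
    std::unique_ptr<InterleaveKernel> _kernel;
};

// --- ".cpp" side -------------------------------------------------------------
// Hypothetical concrete kernel; in the patch these are types such as
// NEGEMMInterleave4x4Kernel or NEGEMMLowpMatrixMultiplyKernel.
class InterleaveKernel
{
public:
    void configure(int rows) { _rows = rows; }
    void run() { /* process _rows rows ... */ }

private:
    int _rows{ 0 };
};

// Defaulting these out of line is what makes a unique_ptr to a forward-declared
// type legal; it is the same reason the patch adds
// NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
Function::Function()  = default;
Function::~Function() = default;

void Function::configure(int rows)
{
    // Kernels are created at configure time, mirroring
    // _kernel = make_unique<...>(); _kernel->configure(...);
    _kernel = std::make_unique<InterleaveKernel>();
    _kernel->configure(rows);
}

void Function::run()
{
    // A scheduler would receive the raw pointer, mirroring
    // NEScheduler::get().schedule(_kernel.get(), Window::DimY).
    _kernel->run();
}

int main()
{
    Function f;
    f.configure(4);
    f.run();
}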